print("prelim stats")
prelim stats
#import libraries
import csv
import pandas as pd
import numpy as np
#import and read csv file
#display first 10 rows
import pandas as pd
crime_data1 = pd.read_csv(r"C:\Users\radon\Documents\CIND820/crimedata.csv")
crime_data1.head(10)
| communityname | state | countyCode | communityCode | fold | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct12t29 | agePct16t24 | agePct65up | numbUrban | pctUrban | medIncome | pctWWage | pctWFarmSelf | pctWInvInc | pctWSocSec | pctWPubAsst | pctWRetire | medFamInc | perCapInc | whitePerCap | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | NumUnderPov | PctPopUnderPov | PctLess9thGrade | PctNotHSGrad | PctBSorMore | PctUnemployed | PctEmploy | PctEmplManu | PctEmplProfServ | PctOccupManu | PctOccupMgmtProf | MalePctDivorce | MalePctNevMarr | FemalePctDiv | TotalPctDiv | PersPerFam | PctFam2Par | PctKids2Par | PctYoungKids2Par | PctTeen2Par | PctWorkMomYoungKids | PctWorkMom | NumKidsBornNeverMar | PctKidsBornNeverMar | NumImmig | PctImmigRecent | PctImmigRec5 | PctImmigRec8 | PctImmigRec10 | PctRecentImmig | PctRecImmig5 | PctRecImmig8 | PctRecImmig10 | PctSpeakEnglOnly | PctNotSpeakEnglWell | PctLargHouseFam | PctLargHouseOccup | PersPerOccupHous | PersPerOwnOccHous | PersPerRentOccHous | PctPersOwnOccup | PctPersDenseHous | PctHousLess3BR | MedNumBR | HousVacant | PctHousOccup | PctHousOwnOcc | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctHousNoPhone | PctWOFullPlumb | OwnOccLowQuart | OwnOccMedVal | OwnOccHiQuart | OwnOccQrange | RentLowQ | RentMedian | RentHighQ | RentQrange | MedRent | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | NumInShelters | NumStreet | PctForeignBorn | PctBornSameState | PctSameHouse85 | PctSameCity85 | PctSameState85 | LemasSwornFT | LemasSwFTPerPop | LemasSwFTFieldOps | LemasSwFTFieldPerPop | LemasTotalReq | LemasTotReqPerPop | PolicReqPerOffic | PolicPerPop | RacialMatchCommPol | PctPolicWhite | PctPolicBlack | PctPolicHisp | PctPolicAsian | PctPolicMinor | OfficAssgnDrugUnits | NumKindsDrugsSeiz | PolicAveOTWorked | LandArea | PopDens | PctUsePubTrans | PolicCars | PolicOperBudg | LemasPctPolicOnPatr | LemasGangUnitDeploy | LemasPctOfficDrugUn | PolicBudgPerPop | murders | murdPerPop | rapes | rapesPerPop | robberies | robbbPerPop | assaults | assaultPerPop | burglaries | burglPerPop | larcenies | larcPerPop | autoTheft | autoTheftPerPop | arsons | arsonsPerPop | ViolentCrimesPerPop | nonViolPerPop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 39 | 5320 | 1 | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 21.44 | 10.93 | 11.33 | 11980 | 100.0 | 75122 | 89.24 | 1.55 | 70.20 | 23.62 | 1.03 | 18.39 | 79584 | 29711 | 30233 | 13600 | 5725 | 27101 | 5115 | 22838 | 227 | 1.96 | 5.81 | 9.90 | 48.18 | 2.70 | 64.55 | 14.65 | 28.82 | 5.49 | 50.73 | 3.67 | 26.38 | 5.22 | 4.47 | 3.22 | 91.43 | 90.17 | 95.78 | 95.81 | 44.56 | 58.88 | 31 | 0.36 | 1277 | 8.69 | 13.00 | 20.99 | 30.93 | 0.93 | 1.39 | 2.24 | 3.30 | 85.68 | 1.37 | 4.81 | 4.17 | 2.99 | 3.00 | 2.84 | 91.46 | 0.39 | 11.06 | 3 | 64 | 98.37 | 91.01 | 3.12 | 37.50 | 1959 | 0.00 | 0.28 | 215900 | 262600 | 326900 | 111000 | 685 | 1001 | 1001 | 316 | 1001 | 23.8 | 21.1 | 14.0 | 11 | 0 | 10.66 | 53.72 | 65.29 | 78.09 | 89.14 | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | 6.5 | 1845.9 | 9.63 | ? | ? | ? | ? | 0.00 | ? | 0 | 0.00 | 0 | 0 | 1 | 8.2 | 4 | 32.81 | 14 | 114.85 | 138 | 1132.08 | 16 | 131.26 | 2 | 16.41 | 41.02 | 1394.59 |
| 1 | Marpletownship | PA | 45 | 47616 | 1 | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 21.30 | 10.48 | 17.18 | 23123 | 100.0 | 47917 | 78.99 | 1.11 | 64.11 | 35.50 | 2.75 | 22.85 | 55323 | 20148 | 20191 | 18137 | 0 | 20074 | 5250 | 12222 | 885 | 3.98 | 5.61 | 13.72 | 29.89 | 2.43 | 61.96 | 12.26 | 29.28 | 6.39 | 37.64 | 4.23 | 27.99 | 6.45 | 5.42 | 3.11 | 86.91 | 85.33 | 96.82 | 86.46 | 51.14 | 62.43 | 43 | 0.24 | 1920 | 5.21 | 8.65 | 13.33 | 22.50 | 0.43 | 0.72 | 1.11 | 1.87 | 87.79 | 1.81 | 4.25 | 3.34 | 2.70 | 2.83 | 1.96 | 89.03 | 1.01 | 23.60 | 3 | 240 | 97.15 | 84.88 | 0.00 | 18.33 | 1958 | 0.31 | 0.14 | 136300 | 164200 | 199900 | 63600 | 467 | 560 | 672 | 205 | 627 | 27.6 | 20.7 | 12.5 | 0 | 0 | 8.30 | 77.17 | 71.27 | 90.22 | 96.12 | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | 10.6 | 2186.7 | 3.84 | ? | ? | ? | ? | 0.00 | ? | 0 | 0.00 | 1 | 4.25 | 5 | 21.26 | 24 | 102.05 | 57 | 242.37 | 376 | 1598.78 | 26 | 110.55 | 1 | 4.25 | 127.56 | 1955.95 |
| 2 | Tigardcity | OR | ? | ? | 1 | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 25.88 | 11.01 | 10.28 | 29344 | 100.0 | 35669 | 82.00 | 1.15 | 55.73 | 22.25 | 2.94 | 14.56 | 42112 | 16946 | 17103 | 16644 | 21606 | 15528 | 5954 | 8405 | 1389 | 4.75 | 2.80 | 9.09 | 30.13 | 4.01 | 69.80 | 15.95 | 21.52 | 8.79 | 32.48 | 10.10 | 25.78 | 14.76 | 12.55 | 2.95 | 78.54 | 78.85 | 92.37 | 75.72 | 66.08 | 74.19 | 164 | 0.88 | 1468 | 16.42 | 23.98 | 32.08 | 35.63 | 0.82 | 1.20 | 1.61 | 1.78 | 93.11 | 1.14 | 2.97 | 2.05 | 2.42 | 2.69 | 2.06 | 64.18 | 2.03 | 47.46 | 3 | 544 | 95.68 | 57.79 | 0.92 | 7.54 | 1976 | 1.55 | 0.12 | 74700 | 90400 | 112000 | 37300 | 370 | 428 | 520 | 150 | 484 | 24.1 | 21.7 | 11.6 | 16 | 0 | 5.00 | 44.77 | 36.60 | 61.26 | 82.85 | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | 10.6 | 2780.9 | 4.37 | ? | ? | ? | ? | 0.00 | ? | 3 | 8.30 | 6 | 16.6 | 56 | 154.95 | 14 | 38.74 | 274 | 758.14 | 1797 | 4972.19 | 136 | 376.3 | 22 | 60.87 | 218.59 | 6167.51 |
| 3 | Gloversvillecity | NY | 35 | 29443 | 1 | 16656 | 2.40 | 1.70 | 97.35 | 0.50 | 0.70 | 12.55 | 25.20 | 12.19 | 17.57 | 0 | 0.0 | 20580 | 68.15 | 0.24 | 38.95 | 39.48 | 11.71 | 18.33 | 26501 | 10810 | 10909 | 9984 | 4941 | 3541 | 2451 | 4391 | 2831 | 17.23 | 11.05 | 33.68 | 10.81 | 9.86 | 54.74 | 31.22 | 27.43 | 26.76 | 22.71 | 10.98 | 28.15 | 14.47 | 12.91 | 2.98 | 64.02 | 62.36 | 65.38 | 67.43 | 59.59 | 70.27 | 561 | 3.84 | 339 | 13.86 | 13.86 | 15.34 | 15.34 | 0.28 | 0.28 | 0.31 | 0.31 | 94.98 | 0.56 | 3.93 | 2.56 | 2.37 | 2.51 | 2.20 | 58.18 | 1.21 | 45.66 | 3 | 669 | 91.19 | 54.89 | 2.54 | 57.85 | 1939 | 7.00 | 0.87 | 36400 | 49600 | 66500 | 30100 | 195 | 250 | 309 | 114 | 333 | 28.7 | 20.6 | 14.5 | 0 | 0 | 2.04 | 88.71 | 56.70 | 90.17 | 96.24 | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | 5.2 | 3217.7 | 3.31 | ? | ? | ? | ? | 0.00 | ? | 0 | 0.00 | 10 | 57.86 | 10 | 57.86 | 33 | 190.93 | 225 | 1301.78 | 716 | 4142.56 | 47 | 271.93 | ? | ? | 306.64 | ? |
| 4 | Bemidjicity | MN | 7 | 5068 | 1 | 11245 | 2.76 | 0.53 | 89.16 | 1.17 | 0.52 | 24.46 | 40.53 | 28.69 | 12.65 | 0 | 0.0 | 17390 | 69.33 | 0.55 | 42.82 | 32.16 | 11.21 | 14.43 | 24018 | 8483 | 9009 | 887 | 4425 | 3352 | 3000 | 1328 | 2855 | 29.99 | 12.15 | 23.06 | 25.28 | 9.08 | 52.44 | 6.89 | 36.54 | 10.94 | 27.80 | 7.51 | 50.66 | 11.64 | 9.73 | 2.98 | 58.59 | 55.20 | 66.51 | 79.17 | 61.22 | 68.94 | 402 | 4.70 | 196 | 46.94 | 56.12 | 67.86 | 69.90 | 0.82 | 0.98 | 1.18 | 1.22 | 94.64 | 0.39 | 5.23 | 3.11 | 2.35 | 2.55 | 2.12 | 58.13 | 2.94 | 55.64 | 2 | 333 | 92.45 | 53.57 | 3.90 | 42.64 | 1958 | 7.45 | 0.82 | 30600 | 43200 | 59500 | 28900 | 202 | 283 | 362 | 160 | 332 | 32.2 | 23.2 | 12.9 | 2 | 0 | 1.74 | 73.75 | 42.22 | 60.34 | 89.02 | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | 11.5 | 974.2 | 0.38 | ? | ? | ? | ? | 0.00 | ? | 0 | 0.00 | ? | ? | 4 | 32.04 | 14 | 112.14 | 91 | 728.93 | 1060 | 8490.87 | 91 | 728.93 | 5 | 40.05 | ? | 9988.79 |
| 5 | Springfieldcity | MO | ? | ? | 1 | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 32.89 | 20.04 | 13.26 | 140494 | 100.0 | 21577 | 75.78 | 1.00 | 41.15 | 29.31 | 7.12 | 14.09 | 27705 | 11878 | 12029 | 7382 | 10264 | 10753 | 7192 | 8104 | 23223 | 17.78 | 8.76 | 23.03 | 20.66 | 5.72 | 59.02 | 14.31 | 26.83 | 14.72 | 23.42 | 11.40 | 33.32 | 14.46 | 13.04 | 2.89 | 71.94 | 69.79 | 79.76 | 75.33 | 62.96 | 70.52 | 1511 | 1.58 | 2091 | 21.33 | 30.56 | 38.02 | 45.48 | 0.32 | 0.45 | 0.57 | 0.68 | 96.87 | 0.60 | 3.08 | 1.92 | 2.28 | 2.37 | 2.16 | 57.81 | 2.11 | 53.19 | 2 | 5119 | 91.81 | 55.50 | 2.09 | 26.22 | 1966 | 6.13 | 0.31 | 37700 | 53900 | 73100 | 35400 | 215 | 280 | 349 | 134 | 340 | 26.4 | 17.3 | 11.7 | 327 | 4 | 1.49 | 64.35 | 42.29 | 70.61 | 85.66 | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | 70.4 | 1995.7 | 0.97 | ? | ? | ? | ? | 0.00 | ? | 7 | 4.63 | 77 | 50.98 | 136 | 90.05 | 449 | 297.29 | 2094 | 1386.46 | 7690 | 5091.64 | 454 | 300.6 | 134 | 88.72 | 442.95 | 6867.42 |
| 6 | Norwoodtown | MA | 21 | 50250 | 1 | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 27.41 | 12.76 | 14.42 | 28700 | 100.0 | 42805 | 79.47 | 0.39 | 47.70 | 30.23 | 5.41 | 17.23 | 50394 | 18193 | 18276 | 17342 | 21482 | 12639 | 21852 | 22594 | 1126 | 4.01 | 4.49 | 13.89 | 27.01 | 4.85 | 65.42 | 14.02 | 27.17 | 8.50 | 32.78 | 5.97 | 36.05 | 9.06 | 7.64 | 3.14 | 79.53 | 79.76 | 92.05 | 77.12 | 65.16 | 72.81 | 263 | 1.18 | 2637 | 11.38 | 16.27 | 23.93 | 27.76 | 1.05 | 1.49 | 2.20 | 2.55 | 89.98 | 0.60 | 5.08 | 3.46 | 2.55 | 2.89 | 2.09 | 64.62 | 1.47 | 47.35 | 3 | 566 | 95.11 | 56.96 | 1.41 | 34.45 | 1956 | 0.69 | 0.28 | 155100 | 179000 | 215500 | 60400 | 463 | 669 | 824 | 361 | 736 | 24.4 | 20.8 | 12.5 | 0 | 0 | 9.19 | 77.30 | 63.45 | 82.23 | 93.53 | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | 10.9 | 2643.5 | 9.62 | ? | ? | ? | ? | 0.00 | ? | 0 | 0.00 | 4 | 13.53 | 9 | 30.44 | 54 | 182.66 | 110 | 372.09 | 288 | 974.19 | 144 | 487.1 | 17 | 57.5 | 226.63 | 1890.88 |
| 7 | Andersoncity | IN | ? | ? | 1 | 59459 | 2.45 | 14.20 | 84.87 | 0.40 | 0.63 | 15.31 | 27.93 | 14.78 | 14.60 | 59449 | 100.0 | 23221 | 71.60 | 0.67 | 35.74 | 32.58 | 8.81 | 22.59 | 28901 | 12161 | 12599 | 9820 | 6634 | 8802 | 7428 | 6187 | 10320 | 17.98 | 10.09 | 28.67 | 12.00 | 8.19 | 56.59 | 27.00 | 21.54 | 21.92 | 18.02 | 13.28 | 28.34 | 16.33 | 14.94 | 2.95 | 62.56 | 58.70 | 69.89 | 62.76 | 63.08 | 72.44 | 2368 | 4.66 | 517 | 13.15 | 22.82 | 28.24 | 33.08 | 0.11 | 0.20 | 0.25 | 0.29 | 97.43 | 0.28 | 3.85 | 2.55 | 2.36 | 2.42 | 2.27 | 65.29 | 1.90 | 56.30 | 2 | 2051 | 92.22 | 63.82 | 6.39 | 56.36 | 1954 | 8.42 | 0.49 | 26300 | 37000 | 52400 | 26100 | 186 | 253 | 325 | 139 | 338 | 26.3 | 15.1 | 12.2 | 21 | 0 | 0.87 | 73.70 | 54.85 | 85.55 | 91.51 | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | 39.2 | 1515.3 | 0.70 | ? | ? | ? | ? | 0.00 | ? | 8 | 13.13 | 34 | 55.79 | 98 | 160.8 | 128 | 210.02 | 608 | 997.6 | 2250 | 3691.79 | 125 | 205.1 | 9 | 14.77 | 439.73 | 4909.26 |
| 8 | Fargocity | ND | 17 | 25700 | 1 | 74111 | 2.46 | 0.35 | 97.11 | 1.25 | 0.73 | 16.64 | 35.16 | 20.33 | 8.58 | 74115 | 100.0 | 25326 | 83.69 | 2.93 | 47.11 | 19.30 | 4.21 | 10.31 | 34269 | 13554 | 13727 | 8852 | 5344 | 8011 | 5332 | 5174 | 9603 | 13.68 | 5.52 | 11.27 | 30.24 | 4.18 | 68.51 | 6.89 | 31.55 | 11.37 | 29.43 | 7.29 | 40.87 | 9.94 | 8.64 | 3.00 | 79.35 | 79.70 | 86.60 | 80.70 | 74.32 | 78.51 | 751 | 1.64 | 1474 | 23.68 | 33.58 | 46.68 | 53.93 | 0.47 | 0.67 | 0.93 | 1.07 | 95.21 | 0.43 | 2.59 | 1.54 | 2.32 | 2.77 | 1.91 | 57.42 | 1.67 | 59.32 | 2 | 1562 | 95.07 | 48.10 | 0.45 | 25.61 | 1971 | 2.66 | 0.19 | 54500 | 70300 | 93700 | 39200 | 241 | 321 | 387 | 146 | 355 | 25.2 | 20.7 | 12.8 | 125 | 15 | 1.99 | 58.82 | 40.72 | 67.97 | 81.39 | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | 30.9 | 2399.3 | 1.41 | ? | ? | ? | ? | 0.00 | ? | 0 | 0.00 | 35 | 43.87 | 16 | 20.05 | 41 | 51.39 | 425 | 532.66 | 3149 | 3946.71 | 206 | 258.18 | 8 | 10.03 | 115.31 | 4747.58 |
| 9 | Wacocity | TX | ? | ? | 1 | 103590 | 2.62 | 23.14 | 67.60 | 0.92 | 16.35 | 19.88 | 34.55 | 21.62 | 13.12 | 103590 | 100.0 | 17852 | 74.20 | 0.86 | 30.98 | 29.09 | 9.06 | 13.99 | 24058 | 10195 | 12126 | 5715 | 11313 | 5770 | 7320 | 6984 | 27767 | 28.68 | 13.01 | 31.62 | 17.02 | 8.39 | 51.37 | 15.73 | 29.06 | 16.43 | 24.30 | 11.07 | 38.49 | 14.66 | 12.97 | 3.11 | 61.65 | 54.56 | 68.85 | 61.69 | 60.80 | 69.23 | 3537 | 4.71 | 4793 | 15.54 | 23.08 | 35.32 | 49.82 | 0.72 | 1.07 | 1.63 | 2.31 | 85.72 | 2.51 | 6.70 | 4.10 | 2.45 | 2.47 | 2.44 | 46.82 | 6.14 | 59.96 | 2 | 5606 | 87.57 | 46.51 | 5.64 | 37.57 | 1960 | 11.74 | 0.33 | 28600 | 43100 | 67400 | 38800 | 192 | 281 | 369 | 177 | 353 | 29.6 | 19.4 | 13.0 | 43 | 4 | 4.63 | 75.59 | 42.33 | 74.05 | 92.12 | 198 | 183.53 | 187 | 173.33 | 73432 | 68065.1 | 370.9 | 183.5 | 89.32 | 78.28 | 11.11 | 10.61 | 0 | 21.72 | 13 | 12 | 60.2 | 78.5 | 1319.3 | 0.76 | 100 | 9315474 | 94.44 | 10 | 6.57 | 86346.3 | 29 | 26.88 | 141 | 130.69 | 453 | 419.89 | 1043 | 966.77 | 2397 | 2221.81 | 6121 | 5673.63 | 1070 | 991.8 | 18 | 16.68 | 1544.24 | 8903.93 |
#display data types of all columns
datatypes = crime_data1.dtypes
print(datatypes)
#replace all missing values with NaN
import pandas as pd
crime_data = pd.read_csv('crimedata.csv', na_values=['?'])
crime_data.head(10)
| communityname | state | countyCode | communityCode | fold | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct12t29 | agePct16t24 | agePct65up | numbUrban | pctUrban | medIncome | pctWWage | pctWFarmSelf | pctWInvInc | pctWSocSec | pctWPubAsst | pctWRetire | medFamInc | perCapInc | whitePerCap | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | NumUnderPov | PctPopUnderPov | PctLess9thGrade | PctNotHSGrad | PctBSorMore | PctUnemployed | PctEmploy | PctEmplManu | PctEmplProfServ | PctOccupManu | PctOccupMgmtProf | MalePctDivorce | MalePctNevMarr | FemalePctDiv | TotalPctDiv | PersPerFam | PctFam2Par | PctKids2Par | PctYoungKids2Par | PctTeen2Par | PctWorkMomYoungKids | PctWorkMom | NumKidsBornNeverMar | PctKidsBornNeverMar | NumImmig | PctImmigRecent | PctImmigRec5 | PctImmigRec8 | PctImmigRec10 | PctRecentImmig | PctRecImmig5 | PctRecImmig8 | PctRecImmig10 | PctSpeakEnglOnly | PctNotSpeakEnglWell | PctLargHouseFam | PctLargHouseOccup | PersPerOccupHous | PersPerOwnOccHous | PersPerRentOccHous | PctPersOwnOccup | PctPersDenseHous | PctHousLess3BR | MedNumBR | HousVacant | PctHousOccup | PctHousOwnOcc | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctHousNoPhone | PctWOFullPlumb | OwnOccLowQuart | OwnOccMedVal | OwnOccHiQuart | OwnOccQrange | RentLowQ | RentMedian | RentHighQ | RentQrange | MedRent | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | NumInShelters | NumStreet | PctForeignBorn | PctBornSameState | PctSameHouse85 | PctSameCity85 | PctSameState85 | LemasSwornFT | LemasSwFTPerPop | LemasSwFTFieldOps | LemasSwFTFieldPerPop | LemasTotalReq | LemasTotReqPerPop | PolicReqPerOffic | PolicPerPop | RacialMatchCommPol | PctPolicWhite | PctPolicBlack | PctPolicHisp | PctPolicAsian | PctPolicMinor | OfficAssgnDrugUnits | NumKindsDrugsSeiz | PolicAveOTWorked | LandArea | PopDens | PctUsePubTrans | PolicCars | PolicOperBudg | LemasPctPolicOnPatr | LemasGangUnitDeploy | LemasPctOfficDrugUn | PolicBudgPerPop | murders | murdPerPop | rapes | rapesPerPop | robberies | robbbPerPop | assaults | assaultPerPop | burglaries | burglPerPop | larcenies | larcPerPop | autoTheft | autoTheftPerPop | arsons | arsonsPerPop | ViolentCrimesPerPop | nonViolPerPop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 39.0 | 5320.0 | 1 | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 21.44 | 10.93 | 11.33 | 11980 | 100.0 | 75122 | 89.24 | 1.55 | 70.20 | 23.62 | 1.03 | 18.39 | 79584 | 29711 | 30233 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 227 | 1.96 | 5.81 | 9.90 | 48.18 | 2.70 | 64.55 | 14.65 | 28.82 | 5.49 | 50.73 | 3.67 | 26.38 | 5.22 | 4.47 | 3.22 | 91.43 | 90.17 | 95.78 | 95.81 | 44.56 | 58.88 | 31 | 0.36 | 1277 | 8.69 | 13.00 | 20.99 | 30.93 | 0.93 | 1.39 | 2.24 | 3.30 | 85.68 | 1.37 | 4.81 | 4.17 | 2.99 | 3.00 | 2.84 | 91.46 | 0.39 | 11.06 | 3 | 64 | 98.37 | 91.01 | 3.12 | 37.50 | 1959 | 0.00 | 0.28 | 215900 | 262600 | 326900 | 111000 | 685 | 1001 | 1001 | 316 | 1001 | 23.8 | 21.1 | 14.0 | 11 | 0 | 10.66 | 53.72 | 65.29 | 78.09 | 89.14 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6.5 | 1845.9 | 9.63 | NaN | NaN | NaN | NaN | 0.00 | NaN | 0 | 0.00 | 0.0 | 0.00 | 1.0 | 8.20 | 4.0 | 32.81 | 14.0 | 114.85 | 138.0 | 1132.08 | 16.0 | 131.26 | 2.0 | 16.41 | 41.02 | 1394.59 |
| 1 | Marpletownship | PA | 45.0 | 47616.0 | 1 | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 21.30 | 10.48 | 17.18 | 23123 | 100.0 | 47917 | 78.99 | 1.11 | 64.11 | 35.50 | 2.75 | 22.85 | 55323 | 20148 | 20191 | 18137 | 0 | 20074 | 5250.0 | 12222 | 885 | 3.98 | 5.61 | 13.72 | 29.89 | 2.43 | 61.96 | 12.26 | 29.28 | 6.39 | 37.64 | 4.23 | 27.99 | 6.45 | 5.42 | 3.11 | 86.91 | 85.33 | 96.82 | 86.46 | 51.14 | 62.43 | 43 | 0.24 | 1920 | 5.21 | 8.65 | 13.33 | 22.50 | 0.43 | 0.72 | 1.11 | 1.87 | 87.79 | 1.81 | 4.25 | 3.34 | 2.70 | 2.83 | 1.96 | 89.03 | 1.01 | 23.60 | 3 | 240 | 97.15 | 84.88 | 0.00 | 18.33 | 1958 | 0.31 | 0.14 | 136300 | 164200 | 199900 | 63600 | 467 | 560 | 672 | 205 | 627 | 27.6 | 20.7 | 12.5 | 0 | 0 | 8.30 | 77.17 | 71.27 | 90.22 | 96.12 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10.6 | 2186.7 | 3.84 | NaN | NaN | NaN | NaN | 0.00 | NaN | 0 | 0.00 | 1.0 | 4.25 | 5.0 | 21.26 | 24.0 | 102.05 | 57.0 | 242.37 | 376.0 | 1598.78 | 26.0 | 110.55 | 1.0 | 4.25 | 127.56 | 1955.95 |
| 2 | Tigardcity | OR | NaN | NaN | 1 | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 25.88 | 11.01 | 10.28 | 29344 | 100.0 | 35669 | 82.00 | 1.15 | 55.73 | 22.25 | 2.94 | 14.56 | 42112 | 16946 | 17103 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 1389 | 4.75 | 2.80 | 9.09 | 30.13 | 4.01 | 69.80 | 15.95 | 21.52 | 8.79 | 32.48 | 10.10 | 25.78 | 14.76 | 12.55 | 2.95 | 78.54 | 78.85 | 92.37 | 75.72 | 66.08 | 74.19 | 164 | 0.88 | 1468 | 16.42 | 23.98 | 32.08 | 35.63 | 0.82 | 1.20 | 1.61 | 1.78 | 93.11 | 1.14 | 2.97 | 2.05 | 2.42 | 2.69 | 2.06 | 64.18 | 2.03 | 47.46 | 3 | 544 | 95.68 | 57.79 | 0.92 | 7.54 | 1976 | 1.55 | 0.12 | 74700 | 90400 | 112000 | 37300 | 370 | 428 | 520 | 150 | 484 | 24.1 | 21.7 | 11.6 | 16 | 0 | 5.00 | 44.77 | 36.60 | 61.26 | 82.85 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10.6 | 2780.9 | 4.37 | NaN | NaN | NaN | NaN | 0.00 | NaN | 3 | 8.30 | 6.0 | 16.60 | 56.0 | 154.95 | 14.0 | 38.74 | 274.0 | 758.14 | 1797.0 | 4972.19 | 136.0 | 376.30 | 22.0 | 60.87 | 218.59 | 6167.51 |
| 3 | Gloversvillecity | NY | 35.0 | 29443.0 | 1 | 16656 | 2.40 | 1.70 | 97.35 | 0.50 | 0.70 | 12.55 | 25.20 | 12.19 | 17.57 | 0 | 0.0 | 20580 | 68.15 | 0.24 | 38.95 | 39.48 | 11.71 | 18.33 | 26501 | 10810 | 10909 | 9984 | 4941 | 3541 | 2451.0 | 4391 | 2831 | 17.23 | 11.05 | 33.68 | 10.81 | 9.86 | 54.74 | 31.22 | 27.43 | 26.76 | 22.71 | 10.98 | 28.15 | 14.47 | 12.91 | 2.98 | 64.02 | 62.36 | 65.38 | 67.43 | 59.59 | 70.27 | 561 | 3.84 | 339 | 13.86 | 13.86 | 15.34 | 15.34 | 0.28 | 0.28 | 0.31 | 0.31 | 94.98 | 0.56 | 3.93 | 2.56 | 2.37 | 2.51 | 2.20 | 58.18 | 1.21 | 45.66 | 3 | 669 | 91.19 | 54.89 | 2.54 | 57.85 | 1939 | 7.00 | 0.87 | 36400 | 49600 | 66500 | 30100 | 195 | 250 | 309 | 114 | 333 | 28.7 | 20.6 | 14.5 | 0 | 0 | 2.04 | 88.71 | 56.70 | 90.17 | 96.24 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5.2 | 3217.7 | 3.31 | NaN | NaN | NaN | NaN | 0.00 | NaN | 0 | 0.00 | 10.0 | 57.86 | 10.0 | 57.86 | 33.0 | 190.93 | 225.0 | 1301.78 | 716.0 | 4142.56 | 47.0 | 271.93 | NaN | NaN | 306.64 | NaN |
| 4 | Bemidjicity | MN | 7.0 | 5068.0 | 1 | 11245 | 2.76 | 0.53 | 89.16 | 1.17 | 0.52 | 24.46 | 40.53 | 28.69 | 12.65 | 0 | 0.0 | 17390 | 69.33 | 0.55 | 42.82 | 32.16 | 11.21 | 14.43 | 24018 | 8483 | 9009 | 887 | 4425 | 3352 | 3000.0 | 1328 | 2855 | 29.99 | 12.15 | 23.06 | 25.28 | 9.08 | 52.44 | 6.89 | 36.54 | 10.94 | 27.80 | 7.51 | 50.66 | 11.64 | 9.73 | 2.98 | 58.59 | 55.20 | 66.51 | 79.17 | 61.22 | 68.94 | 402 | 4.70 | 196 | 46.94 | 56.12 | 67.86 | 69.90 | 0.82 | 0.98 | 1.18 | 1.22 | 94.64 | 0.39 | 5.23 | 3.11 | 2.35 | 2.55 | 2.12 | 58.13 | 2.94 | 55.64 | 2 | 333 | 92.45 | 53.57 | 3.90 | 42.64 | 1958 | 7.45 | 0.82 | 30600 | 43200 | 59500 | 28900 | 202 | 283 | 362 | 160 | 332 | 32.2 | 23.2 | 12.9 | 2 | 0 | 1.74 | 73.75 | 42.22 | 60.34 | 89.02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 11.5 | 974.2 | 0.38 | NaN | NaN | NaN | NaN | 0.00 | NaN | 0 | 0.00 | NaN | NaN | 4.0 | 32.04 | 14.0 | 112.14 | 91.0 | 728.93 | 1060.0 | 8490.87 | 91.0 | 728.93 | 5.0 | 40.05 | NaN | 9988.79 |
| 5 | Springfieldcity | MO | NaN | NaN | 1 | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 32.89 | 20.04 | 13.26 | 140494 | 100.0 | 21577 | 75.78 | 1.00 | 41.15 | 29.31 | 7.12 | 14.09 | 27705 | 11878 | 12029 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 23223 | 17.78 | 8.76 | 23.03 | 20.66 | 5.72 | 59.02 | 14.31 | 26.83 | 14.72 | 23.42 | 11.40 | 33.32 | 14.46 | 13.04 | 2.89 | 71.94 | 69.79 | 79.76 | 75.33 | 62.96 | 70.52 | 1511 | 1.58 | 2091 | 21.33 | 30.56 | 38.02 | 45.48 | 0.32 | 0.45 | 0.57 | 0.68 | 96.87 | 0.60 | 3.08 | 1.92 | 2.28 | 2.37 | 2.16 | 57.81 | 2.11 | 53.19 | 2 | 5119 | 91.81 | 55.50 | 2.09 | 26.22 | 1966 | 6.13 | 0.31 | 37700 | 53900 | 73100 | 35400 | 215 | 280 | 349 | 134 | 340 | 26.4 | 17.3 | 11.7 | 327 | 4 | 1.49 | 64.35 | 42.29 | 70.61 | 85.66 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 70.4 | 1995.7 | 0.97 | NaN | NaN | NaN | NaN | 0.00 | NaN | 7 | 4.63 | 77.0 | 50.98 | 136.0 | 90.05 | 449.0 | 297.29 | 2094.0 | 1386.46 | 7690.0 | 5091.64 | 454.0 | 300.60 | 134.0 | 88.72 | 442.95 | 6867.42 |
| 6 | Norwoodtown | MA | 21.0 | 50250.0 | 1 | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 27.41 | 12.76 | 14.42 | 28700 | 100.0 | 42805 | 79.47 | 0.39 | 47.70 | 30.23 | 5.41 | 17.23 | 50394 | 18193 | 18276 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 1126 | 4.01 | 4.49 | 13.89 | 27.01 | 4.85 | 65.42 | 14.02 | 27.17 | 8.50 | 32.78 | 5.97 | 36.05 | 9.06 | 7.64 | 3.14 | 79.53 | 79.76 | 92.05 | 77.12 | 65.16 | 72.81 | 263 | 1.18 | 2637 | 11.38 | 16.27 | 23.93 | 27.76 | 1.05 | 1.49 | 2.20 | 2.55 | 89.98 | 0.60 | 5.08 | 3.46 | 2.55 | 2.89 | 2.09 | 64.62 | 1.47 | 47.35 | 3 | 566 | 95.11 | 56.96 | 1.41 | 34.45 | 1956 | 0.69 | 0.28 | 155100 | 179000 | 215500 | 60400 | 463 | 669 | 824 | 361 | 736 | 24.4 | 20.8 | 12.5 | 0 | 0 | 9.19 | 77.30 | 63.45 | 82.23 | 93.53 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10.9 | 2643.5 | 9.62 | NaN | NaN | NaN | NaN | 0.00 | NaN | 0 | 0.00 | 4.0 | 13.53 | 9.0 | 30.44 | 54.0 | 182.66 | 110.0 | 372.09 | 288.0 | 974.19 | 144.0 | 487.10 | 17.0 | 57.50 | 226.63 | 1890.88 |
| 7 | Andersoncity | IN | NaN | NaN | 1 | 59459 | 2.45 | 14.20 | 84.87 | 0.40 | 0.63 | 15.31 | 27.93 | 14.78 | 14.60 | 59449 | 100.0 | 23221 | 71.60 | 0.67 | 35.74 | 32.58 | 8.81 | 22.59 | 28901 | 12161 | 12599 | 9820 | 6634 | 8802 | 7428.0 | 6187 | 10320 | 17.98 | 10.09 | 28.67 | 12.00 | 8.19 | 56.59 | 27.00 | 21.54 | 21.92 | 18.02 | 13.28 | 28.34 | 16.33 | 14.94 | 2.95 | 62.56 | 58.70 | 69.89 | 62.76 | 63.08 | 72.44 | 2368 | 4.66 | 517 | 13.15 | 22.82 | 28.24 | 33.08 | 0.11 | 0.20 | 0.25 | 0.29 | 97.43 | 0.28 | 3.85 | 2.55 | 2.36 | 2.42 | 2.27 | 65.29 | 1.90 | 56.30 | 2 | 2051 | 92.22 | 63.82 | 6.39 | 56.36 | 1954 | 8.42 | 0.49 | 26300 | 37000 | 52400 | 26100 | 186 | 253 | 325 | 139 | 338 | 26.3 | 15.1 | 12.2 | 21 | 0 | 0.87 | 73.70 | 54.85 | 85.55 | 91.51 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 39.2 | 1515.3 | 0.70 | NaN | NaN | NaN | NaN | 0.00 | NaN | 8 | 13.13 | 34.0 | 55.79 | 98.0 | 160.80 | 128.0 | 210.02 | 608.0 | 997.60 | 2250.0 | 3691.79 | 125.0 | 205.10 | 9.0 | 14.77 | 439.73 | 4909.26 |
| 8 | Fargocity | ND | 17.0 | 25700.0 | 1 | 74111 | 2.46 | 0.35 | 97.11 | 1.25 | 0.73 | 16.64 | 35.16 | 20.33 | 8.58 | 74115 | 100.0 | 25326 | 83.69 | 2.93 | 47.11 | 19.30 | 4.21 | 10.31 | 34269 | 13554 | 13727 | 8852 | 5344 | 8011 | 5332.0 | 5174 | 9603 | 13.68 | 5.52 | 11.27 | 30.24 | 4.18 | 68.51 | 6.89 | 31.55 | 11.37 | 29.43 | 7.29 | 40.87 | 9.94 | 8.64 | 3.00 | 79.35 | 79.70 | 86.60 | 80.70 | 74.32 | 78.51 | 751 | 1.64 | 1474 | 23.68 | 33.58 | 46.68 | 53.93 | 0.47 | 0.67 | 0.93 | 1.07 | 95.21 | 0.43 | 2.59 | 1.54 | 2.32 | 2.77 | 1.91 | 57.42 | 1.67 | 59.32 | 2 | 1562 | 95.07 | 48.10 | 0.45 | 25.61 | 1971 | 2.66 | 0.19 | 54500 | 70300 | 93700 | 39200 | 241 | 321 | 387 | 146 | 355 | 25.2 | 20.7 | 12.8 | 125 | 15 | 1.99 | 58.82 | 40.72 | 67.97 | 81.39 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 30.9 | 2399.3 | 1.41 | NaN | NaN | NaN | NaN | 0.00 | NaN | 0 | 0.00 | 35.0 | 43.87 | 16.0 | 20.05 | 41.0 | 51.39 | 425.0 | 532.66 | 3149.0 | 3946.71 | 206.0 | 258.18 | 8.0 | 10.03 | 115.31 | 4747.58 |
| 9 | Wacocity | TX | NaN | NaN | 1 | 103590 | 2.62 | 23.14 | 67.60 | 0.92 | 16.35 | 19.88 | 34.55 | 21.62 | 13.12 | 103590 | 100.0 | 17852 | 74.20 | 0.86 | 30.98 | 29.09 | 9.06 | 13.99 | 24058 | 10195 | 12126 | 5715 | 11313 | 5770 | 7320.0 | 6984 | 27767 | 28.68 | 13.01 | 31.62 | 17.02 | 8.39 | 51.37 | 15.73 | 29.06 | 16.43 | 24.30 | 11.07 | 38.49 | 14.66 | 12.97 | 3.11 | 61.65 | 54.56 | 68.85 | 61.69 | 60.80 | 69.23 | 3537 | 4.71 | 4793 | 15.54 | 23.08 | 35.32 | 49.82 | 0.72 | 1.07 | 1.63 | 2.31 | 85.72 | 2.51 | 6.70 | 4.10 | 2.45 | 2.47 | 2.44 | 46.82 | 6.14 | 59.96 | 2 | 5606 | 87.57 | 46.51 | 5.64 | 37.57 | 1960 | 11.74 | 0.33 | 28600 | 43100 | 67400 | 38800 | 192 | 281 | 369 | 177 | 353 | 29.6 | 19.4 | 13.0 | 43 | 4 | 4.63 | 75.59 | 42.33 | 74.05 | 92.12 | 198.0 | 183.53 | 187.0 | 173.33 | 73432.0 | 68065.1 | 370.9 | 183.5 | 89.32 | 78.28 | 11.11 | 10.61 | 0.0 | 21.72 | 13.0 | 12.0 | 60.2 | 78.5 | 1319.3 | 0.76 | 100.0 | 9315474.0 | 94.44 | 10.0 | 6.57 | 86346.3 | 29 | 26.88 | 141.0 | 130.69 | 453.0 | 419.89 | 1043.0 | 966.77 | 2397.0 | 2221.81 | 6121.0 | 5673.63 | 1070.0 | 991.80 | 18.0 | 16.68 | 1544.24 | 8903.93 |
#check number of rows
len(crime_data)
2215
#check datatypes again
datatypes2 = crime_data.dtypes
print(datatypes2)
#display the sum of all NaN in each column
crime_data.isnull().sum()
#seperate all columns with missing values
null_cols = crime_data[crime_data.columns[crime_data.isna().any()]]
null_cols.head()
| countyCode | communityCode | OtherPerCap | LemasSwornFT | LemasSwFTPerPop | LemasSwFTFieldOps | LemasSwFTFieldPerPop | LemasTotalReq | LemasTotReqPerPop | PolicReqPerOffic | PolicPerPop | RacialMatchCommPol | PctPolicWhite | PctPolicBlack | PctPolicHisp | PctPolicAsian | PctPolicMinor | OfficAssgnDrugUnits | NumKindsDrugsSeiz | PolicAveOTWorked | PolicCars | PolicOperBudg | LemasPctPolicOnPatr | LemasGangUnitDeploy | PolicBudgPerPop | rapes | rapesPerPop | robberies | robbbPerPop | assaults | assaultPerPop | burglaries | burglPerPop | larcenies | larcPerPop | autoTheft | autoTheftPerPop | arsons | arsonsPerPop | ViolentCrimesPerPop | nonViolPerPop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39.0 | 5320.0 | 5115.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.00 | 1.0 | 8.20 | 4.0 | 32.81 | 14.0 | 114.85 | 138.0 | 1132.08 | 16.0 | 131.26 | 2.0 | 16.41 | 41.02 | 1394.59 |
| 1 | 45.0 | 47616.0 | 5250.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 4.25 | 5.0 | 21.26 | 24.0 | 102.05 | 57.0 | 242.37 | 376.0 | 1598.78 | 26.0 | 110.55 | 1.0 | 4.25 | 127.56 | 1955.95 |
| 2 | NaN | NaN | 5954.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6.0 | 16.60 | 56.0 | 154.95 | 14.0 | 38.74 | 274.0 | 758.14 | 1797.0 | 4972.19 | 136.0 | 376.30 | 22.0 | 60.87 | 218.59 | 6167.51 |
| 3 | 35.0 | 29443.0 | 2451.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10.0 | 57.86 | 10.0 | 57.86 | 33.0 | 190.93 | 225.0 | 1301.78 | 716.0 | 4142.56 | 47.0 | 271.93 | NaN | NaN | 306.64 | NaN |
| 4 | 7.0 | 5068.0 | 3000.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4.0 | 32.04 | 14.0 | 112.14 | 91.0 | 728.93 | 1060.0 | 8490.87 | 91.0 | 728.93 | 5.0 | 40.05 | NaN | 9988.79 |
#display sums of missing values of only columns that contain missing values
null_list= null_cols.columns.values
null_cols.isnull().sum()
#change settings to display all columns
pd.set_option('display.max_columns', None)
#get summary statistics of all columns
crime_data.describe()
| countyCode | communityCode | fold | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct12t29 | agePct16t24 | agePct65up | numbUrban | pctUrban | medIncome | pctWWage | pctWFarmSelf | pctWInvInc | pctWSocSec | pctWPubAsst | pctWRetire | medFamInc | perCapInc | whitePerCap | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | NumUnderPov | PctPopUnderPov | PctLess9thGrade | PctNotHSGrad | PctBSorMore | PctUnemployed | PctEmploy | PctEmplManu | PctEmplProfServ | PctOccupManu | PctOccupMgmtProf | MalePctDivorce | MalePctNevMarr | FemalePctDiv | TotalPctDiv | PersPerFam | PctFam2Par | PctKids2Par | PctYoungKids2Par | PctTeen2Par | PctWorkMomYoungKids | PctWorkMom | NumKidsBornNeverMar | PctKidsBornNeverMar | NumImmig | PctImmigRecent | PctImmigRec5 | PctImmigRec8 | PctImmigRec10 | PctRecentImmig | PctRecImmig5 | PctRecImmig8 | PctRecImmig10 | PctSpeakEnglOnly | PctNotSpeakEnglWell | PctLargHouseFam | PctLargHouseOccup | PersPerOccupHous | PersPerOwnOccHous | PersPerRentOccHous | PctPersOwnOccup | PctPersDenseHous | PctHousLess3BR | MedNumBR | HousVacant | PctHousOccup | PctHousOwnOcc | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctHousNoPhone | PctWOFullPlumb | OwnOccLowQuart | OwnOccMedVal | OwnOccHiQuart | OwnOccQrange | RentLowQ | RentMedian | RentHighQ | RentQrange | MedRent | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | NumInShelters | NumStreet | PctForeignBorn | PctBornSameState | PctSameHouse85 | PctSameCity85 | PctSameState85 | LemasSwornFT | LemasSwFTPerPop | LemasSwFTFieldOps | LemasSwFTFieldPerPop | LemasTotalReq | LemasTotReqPerPop | PolicReqPerOffic | PolicPerPop | RacialMatchCommPol | PctPolicWhite | PctPolicBlack | PctPolicHisp | PctPolicAsian | PctPolicMinor | OfficAssgnDrugUnits | NumKindsDrugsSeiz | PolicAveOTWorked | LandArea | PopDens | PctUsePubTrans | PolicCars | PolicOperBudg | LemasPctPolicOnPatr | LemasGangUnitDeploy | LemasPctOfficDrugUn | PolicBudgPerPop | murders | murdPerPop | rapes | rapesPerPop | robberies | robbbPerPop | assaults | assaultPerPop | burglaries | burglPerPop | larcenies | larcPerPop | autoTheft | autoTheftPerPop | arsons | arsonsPerPop | ViolentCrimesPerPop | nonViolPerPop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 994.000000 | 991.000000 | 2215.000000 | 2.215000e+03 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2.215000e+03 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2214.000000 | 2215.000000 | 2.215000e+03 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2.215000e+03 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 343.000000 | 343.000000 | 343.000000 | 343.000000 | 3.430000e+02 | 3.430000e+02 | 343.000000 | 343.000000 | 343.000000 | 343.000000 | 343.000000 | 343.000000 | 343.000000 | 343.000000 | 343.000000 | 343.000000 | 343.000000 | 2215.000000 | 2215.000000 | 2215.000000 | 343.000000 | 3.430000e+02 | 343.000000 | 343.000000 | 2215.000000 | 3.430000e+02 | 2215.000000 | 2215.000000 | 2007.000000 | 2007.00000 | 2214.000000 | 2214.000000 | 2202.000000 | 2202.000000 | 2212.000000 | 2212.000000 | 2212.000000 | 2212.000000 | 2212.000000 | 2212.000000 | 2124.000000 | 2124.000000 | 1994.000000 | 2118.000000 |
| mean | 65.587525 | 45209.251261 | 5.494357 | 5.311798e+04 | 2.707327 | 9.335102 | 83.979819 | 2.670203 | 7.950176 | 14.445837 | 27.644840 | 13.975142 | 11.836393 | 4.773472e+04 | 70.465309 | 33984.696163 | 78.312758 | 0.881842 | 43.750935 | 26.409418 | 6.801445 | 15.969002 | 39857.055079 | 15603.524605 | 16567.698420 | 11541.749436 | 12229.191422 | 14227.989616 | 9442.765131 | 11018.998194 | 7.590853e+03 | 11.620537 | 9.186646 | 22.305120 | 23.056876 | 6.045242 | 62.021612 | 18.228907 | 24.532298 | 13.819165 | 28.209201 | 9.127585 | 30.683517 | 12.325300 | 10.812515 | 3.129698 | 74.059129 | 71.227255 | 81.865422 | 75.521788 | 60.542641 | 68.854795 | 2141.418962 | 3.115499 | 6.277274e+03 | 13.525693 | 20.421287 | 27.544181 | 34.733928 | 1.099124 | 1.697463 | 2.307503 | 2.943761 | 87.074993 | 2.405792 | 5.386619 | 3.915788 | 2.615842 | 2.740483 | 2.367138 | 66.369454 | 4.132438 | 45.405341 | 2.640632 | 1748.368849 | 92.933973 | 63.368298 | 2.778524 | 34.773887 | 1962.623476 | 4.289824 | 0.425273 | 88695.802257 | 113097.523251 | 145318.257788 | 56622.455530 | 329.966591 | 428.537246 | 527.252822 | 197.286230 | 501.466366 | 26.298104 | 20.990158 | 13.010203 | 66.953499 | 17.823476 | 7.340302 | 61.539630 | 51.538596 | 77.411079 | 88.111865 | 499.198251 | 246.490962 | 432.559767 | 210.844781 | 2.524050e+05 | 1.206517e+05 | 523.658309 | 246.493586 | 85.499679 | 82.515831 | 9.263294 | 5.459767 | 0.681283 | 15.242245 | 26.288630 | 8.816327 | 119.114286 | 27.419955 | 2783.835034 | 3.041124 | 185.478134 | 3.217602e+07 | 87.130933 | 4.285714 | 0.980163 | 1.535779e+05 | 7.764786 | 5.859296 | 28.046338 | 36.25848 | 237.952123 | 162.612597 | 326.528156 | 378.004605 | 761.236890 | 1033.430203 | 2137.629295 | 3372.979150 | 516.692586 | 473.965628 | 30.907721 | 32.153682 | 589.078922 | 4908.241804 |
| std | 117.831399 | 25425.861573 | 2.872924 | 2.046203e+05 | 0.334120 | 14.247156 | 16.419080 | 4.473843 | 14.589832 | 4.518623 | 6.181517 | 5.970747 | 4.777565 | 2.056067e+05 | 44.080275 | 13424.680011 | 7.950672 | 0.689006 | 12.787925 | 8.295604 | 4.700335 | 4.622553 | 14251.206032 | 6281.558523 | 6346.840251 | 9232.102062 | 14853.836177 | 9881.266395 | 7926.466713 | 5884.063446 | 3.936146e+04 | 8.600352 | 6.666703 | 10.989517 | 12.687213 | 2.895618 | 8.312045 | 8.099281 | 6.659470 | 6.430264 | 9.326123 | 2.802747 | 8.127991 | 3.262613 | 3.000883 | 0.240743 | 10.525952 | 12.045048 | 12.263736 | 10.365262 | 8.008937 | 6.679960 | 14692.582838 | 3.127681 | 5.541965e+04 | 9.780098 | 12.410355 | 14.368813 | 16.327322 | 1.595766 | 2.461060 | 3.286648 | 4.246468 | 14.076087 | 4.210368 | 3.794309 | 3.175770 | 0.315646 | 0.297421 | 0.391806 | 14.182588 | 5.599131 | 13.778347 | 0.512686 | 6503.866478 | 5.040736 | 13.970057 | 3.592396 | 13.911468 | 11.166555 | 4.088175 | 0.426188 | 66670.781534 | 81906.362277 | 99030.913816 | 39106.498041 | 144.138461 | 170.706644 | 199.290780 | 85.205688 | 169.271735 | 2.979297 | 2.987622 | 1.419679 | 564.253149 | 245.452553 | 8.418476 | 16.750061 | 10.517926 | 10.878186 | 7.287836 | 1681.472251 | 273.799162 | 1493.708385 | 235.478815 | 6.894498e+05 | 1.482113e+05 | 307.839007 | 273.798409 | 10.941312 | 15.332612 | 11.021424 | 10.604533 | 1.706344 | 14.826756 | 100.821921 | 2.836391 | 92.495186 | 109.822600 | 2828.993341 | 4.912917 | 318.542834 | 1.104566e+08 | 10.349612 | 4.064538 | 2.877128 | 2.030409e+05 | 58.166468 | 9.156829 | 105.616135 | 34.23975 | 2250.720788 | 234.486624 | 1987.947941 | 438.238599 | 3111.702756 | 763.354442 | 7600.573464 | 1901.316145 | 3258.164244 | 504.666026 | 180.125248 | 39.240900 | 614.784518 | 2739.708901 |
| min | 1.000000 | 70.000000 | 1.000000 | 1.000500e+04 | 1.600000 | 0.000000 | 2.680000 | 0.030000 | 0.120000 | 4.580000 | 9.380000 | 4.640000 | 1.660000 | 0.000000e+00 | 0.000000 | 8866.000000 | 31.680000 | 0.000000 | 5.810000 | 4.810000 | 0.180000 | 3.460000 | 10447.000000 | 5237.000000 | 5472.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 7.800000e+01 | 0.640000 | 0.200000 | 1.460000 | 1.630000 | 1.320000 | 24.820000 | 2.050000 | 8.690000 | 1.370000 | 6.480000 | 2.130000 | 12.060000 | 3.350000 | 2.830000 | 2.290000 | 22.970000 | 18.300000 | 8.700000 | 20.200000 | 24.420000 | 41.950000 | 0.000000 | 0.000000 | 2.000000e+01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.150000 | 0.000000 | 0.960000 | 0.440000 | 1.580000 | 1.610000 | 1.550000 | 13.930000 | 0.050000 | 3.060000 | 1.000000 | 36.000000 | 37.470000 | 16.860000 | 0.000000 | 3.120000 | 1939.000000 | 0.000000 | 0.000000 | 14999.000000 | 19500.000000 | 28200.000000 | 0.000000 | 99.000000 | 120.000000 | 182.000000 | 0.000000 | 192.000000 | 14.900000 | 14.000000 | 10.100000 | 0.000000 | 0.000000 | 0.180000 | 6.750000 | 11.830000 | 27.950000 | 32.830000 | 65.000000 | 29.400000 | 14.000000 | 19.210000 | 2.100000e+03 | 2.704800e+03 | 20.800000 | 29.400000 | 42.150000 | 1.600000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.900000 | 10.000000 | 0.000000 | 20.000000 | 2.380215e+06 | 10.850000 | 0.000000 | 0.000000 | 1.526040e+04 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 16.920000 | 10.000000 | 77.860000 | 1.000000 | 6.550000 | 0.000000 | 0.000000 | 0.000000 | 116.790000 |
| 25% | 11.000000 | 22887.000000 | 3.000000 | 1.436600e+04 | 2.500000 | 0.860000 | 76.320000 | 0.620000 | 0.930000 | 12.250000 | 24.415000 | 11.320000 | 8.750000 | 0.000000e+00 | 0.000000 | 23817.000000 | 73.400000 | 0.460000 | 34.680000 | 20.770000 | 3.270000 | 12.875000 | 29538.000000 | 11602.500000 | 12610.500000 | 6742.500000 | 6345.000000 | 8285.500000 | 5528.250000 | 7274.000000 | 9.125000e+02 | 4.510000 | 4.640000 | 13.920000 | 14.095000 | 4.045000 | 56.490000 | 12.215000 | 20.075000 | 9.130000 | 21.910000 | 7.110000 | 25.450000 | 9.860000 | 8.575000 | 2.990000 | 67.900000 | 63.990000 | 74.780000 | 70.170000 | 55.430000 | 64.900000 | 147.000000 | 1.070000 | 4.000000e+02 | 6.695000 | 11.255000 | 17.205000 | 22.725000 | 0.170000 | 0.280000 | 0.390000 | 0.520000 | 84.380000 | 0.510000 | 3.390000 | 2.370000 | 2.410000 | 2.550000 | 2.110000 | 57.285000 | 1.290000 | 37.505000 | 2.000000 | 304.500000 | 91.290000 | 54.820000 | 0.720000 | 24.480000 | 1956.000000 | 0.905000 | 0.160000 | 41500.000000 | 56200.000000 | 74300.000000 | 32200.000000 | 213.500000 | 289.500000 | 366.000000 | 139.000000 | 364.000000 | 24.300000 | 18.700000 | 12.000000 | 0.000000 | 0.000000 | 2.060000 | 50.110000 | 44.995000 | 72.060000 | 85.200000 | 131.000000 | 149.115000 | 114.000000 | 132.245000 | 4.986450e+04 | 6.484505e+04 | 343.350000 | 149.150000 | 79.435000 | 76.230000 | 2.020000 | 0.415000 | 0.000000 | 4.950000 | 6.000000 | 7.000000 | 55.100000 | 7.300000 | 1181.900000 | 0.360000 | 54.000000 | 7.275060e+06 | 84.295000 | 0.000000 | 0.000000 | 8.809435e+04 | 0.000000 | 0.000000 | 2.000000 | 11.53500 | 5.000000 | 27.647500 | 18.000000 | 94.187500 | 95.000000 | 511.690000 | 392.000000 | 2040.080000 | 30.000000 | 156.952500 | 1.000000 | 7.670000 | 161.700000 | 2918.070000 |
| 50% | 27.000000 | 46925.000000 | 5.000000 | 2.279200e+04 | 2.660000 | 2.870000 | 90.350000 | 1.230000 | 2.180000 | 13.620000 | 26.780000 | 12.540000 | 11.730000 | 1.804100e+04 | 100.000000 | 31441.000000 | 78.610000 | 0.690000 | 42.880000 | 26.590000 | 5.610000 | 15.650000 | 36678.000000 | 14101.000000 | 15073.000000 | 9777.000000 | 9895.000000 | 12250.000000 | 8186.000000 | 9721.000000 | 2.142000e+03 | 9.330000 | 7.740000 | 21.380000 | 19.650000 | 5.450000 | 62.440000 | 17.300000 | 23.390000 | 13.150000 | 26.240000 | 9.150000 | 29.000000 | 12.520000 | 10.900000 | 3.100000 | 75.030000 | 72.530000 | 83.990000 | 76.920000 | 60.710000 | 69.230000 | 352.000000 | 2.040000 | 1.024000e+03 | 12.260000 | 19.080000 | 26.720000 | 34.790000 | 0.500000 | 0.750000 | 1.040000 | 1.310000 | 92.180000 | 0.920000 | 4.280000 | 3.050000 | 2.570000 | 2.710000 | 2.290000 | 65.910000 | 2.340000 | 46.390000 | 3.000000 | 558.000000 | 94.210000 | 62.830000 | 1.660000 | 34.100000 | 1964.000000 | 2.850000 | 0.320000 | 65500.000000 | 82800.000000 | 106700.000000 | 43400.000000 | 307.000000 | 397.000000 | 486.000000 | 171.000000 | 467.000000 | 26.100000 | 21.000000 | 12.800000 | 0.000000 | 0.000000 | 4.310000 | 64.490000 | 52.170000 | 79.490000 | 90.030000 | 173.000000 | 196.010000 | 152.000000 | 170.270000 | 9.000000e+04 | 9.103460e+04 | 443.200000 | 196.000000 | 87.930000 | 86.180000 | 5.000000 | 2.040000 | 0.000000 | 11.370000 | 12.000000 | 9.000000 | 98.700000 | 13.700000 | 2027.300000 | 1.220000 | 86.000000 | 1.116411e+07 | 89.580000 | 5.000000 | 0.000000 | 1.145820e+05 | 1.000000 | 2.170000 | 7.000000 | 26.92000 | 19.000000 | 74.800000 | 56.000000 | 226.525000 | 205.000000 | 822.715000 | 747.000000 | 3079.510000 | 75.000000 | 302.355000 | 5.000000 | 21.080000 | 374.060000 | 4425.450000 |
| 75% | 80.500000 | 65805.000000 | 8.000000 | 4.302400e+04 | 2.850000 | 11.145000 | 96.225000 | 2.670000 | 7.810000 | 15.360000 | 29.205000 | 14.345000 | 14.415000 | 4.191800e+04 | 100.000000 | 41480.500000 | 84.030000 | 1.100000 | 52.740000 | 31.715000 | 9.105000 | 18.755000 | 46999.000000 | 17795.000000 | 18609.500000 | 14526.000000 | 14757.500000 | 17327.500000 | 11525.500000 | 13418.000000 | 4.988000e+03 | 16.905000 | 11.835000 | 29.195000 | 29.055000 | 7.440000 | 67.825000 | 23.400000 | 27.590000 | 17.665000 | 32.815000 | 11.050000 | 33.410000 | 14.745000 | 12.985000 | 3.220000 | 81.900000 | 80.395000 | 91.675000 | 82.765000 | 65.985000 | 73.495000 | 1031.500000 | 3.910000 | 3.302000e+03 | 17.950000 | 27.445000 | 36.495000 | 46.185000 | 1.310000 | 2.015000 | 2.700000 | 3.455000 | 95.455000 | 2.270000 | 5.870000 | 4.210000 | 2.770000 | 2.900000 | 2.530000 | 76.580000 | 4.730000 | 53.515000 | 3.000000 | 1228.000000 | 96.020000 | 72.645000 | 3.430000 | 43.970000 | 1971.000000 | 6.805000 | 0.555000 | 121500.000000 | 150600.000000 | 188000.000000 | 65450.000000 | 421.000000 | 544.000000 | 659.500000 | 232.500000 | 615.000000 | 28.000000 | 23.100000 | 13.700000 | 22.000000 | 1.000000 | 9.250000 | 74.855000 | 58.740000 | 85.135000 | 93.010000 | 314.000000 | 260.650000 | 285.500000 | 226.815000 | 1.719235e+05 | 1.303246e+05 | 637.250000 | 260.650000 | 93.645000 | 93.340000 | 14.065000 | 6.215000 | 0.650000 | 19.740000 | 23.000000 | 10.500000 | 153.550000 | 26.100000 | 3321.700000 | 3.365000 | 189.500000 | 2.014754e+07 | 93.200000 | 10.000000 | 0.000000 | 1.556557e+05 | 3.000000 | 8.365000 | 19.000000 | 51.47000 | 70.000000 | 187.155000 | 180.000000 | 504.387500 | 508.000000 | 1350.232500 | 1675.000000 | 4335.410000 | 232.500000 | 589.775000 | 16.000000 | 42.852500 | 794.400000 | 6229.280000 |
| max | 840.000000 | 94597.000000 | 10.000000 | 7.322564e+06 | 5.280000 | 96.670000 | 99.630000 | 57.460000 | 95.290000 | 54.400000 | 70.510000 | 63.620000 | 52.770000 | 7.322564e+06 | 100.000000 | 123625.000000 | 96.760000 | 6.530000 | 89.040000 | 76.390000 | 44.820000 | 45.510000 | 139008.000000 | 63302.000000 | 68850.000000 | 212120.000000 | 480000.000000 | 106165.000000 | 137000.000000 | 54648.000000 | 1.384994e+06 | 58.000000 | 49.890000 | 73.660000 | 79.180000 | 31.230000 | 84.670000 | 50.030000 | 62.670000 | 44.270000 | 64.970000 | 20.080000 | 76.600000 | 23.920000 | 22.230000 | 4.640000 | 93.600000 | 92.580000 | 100.000000 | 97.340000 | 87.970000 | 89.370000 | 527557.000000 | 27.350000 | 2.082931e+06 | 64.290000 | 76.160000 | 80.810000 | 88.000000 | 13.710000 | 19.930000 | 25.340000 | 32.630000 | 98.980000 | 38.330000 | 34.870000 | 30.870000 | 4.520000 | 4.480000 | 4.730000 | 97.240000 | 59.490000 | 95.340000 | 4.000000 | 172768.000000 | 99.000000 | 96.490000 | 39.890000 | 82.130000 | 1987.000000 | 23.880000 | 5.330000 | 500001.000000 | 500001.000000 | 500001.000000 | 331000.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 803.000000 | 1001.000000 | 35.100000 | 32.700000 | 23.400000 | 23383.000000 | 10447.000000 | 60.400000 | 93.140000 | 78.560000 | 96.590000 | 99.900000 | 25655.000000 | 3437.230000 | 22496.000000 | 3290.620000 | 8.328470e+06 | 1.926282e+06 | 2162.500000 | 3437.200000 | 100.000000 | 100.000000 | 67.310000 | 98.400000 | 18.570000 | 98.400000 | 1773.000000 | 15.000000 | 634.700000 | 3569.800000 | 44229.900000 | 54.330000 | 3187.000000 | 1.617293e+09 | 99.940000 | 10.000000 | 48.440000 | 2.422367e+06 | 1946.000000 | 91.090000 | 2818.000000 | 401.35000 | 86001.000000 | 2264.130000 | 62778.000000 | 4932.500000 | 99207.000000 | 11881.020000 | 235132.000000 | 25910.550000 | 112464.000000 | 4968.590000 | 5119.000000 | 436.370000 | 4877.060000 | 27119.760000 |
crime_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2215 entries, 0 to 2214 Columns: 147 entries, communityname to nonViolPerPop dtypes: float64(116), int64(29), object(2) memory usage: 2.5+ MB
#find the mean of each column by state
mean_data = crime_data.groupby("state").mean()
mean_data.head()
C:\Users\radon\AppData\Local\Temp\ipykernel_10788\3113613801.py:2: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
mean_data = crime_data.groupby("state").mean()
| countyCode | communityCode | fold | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct12t29 | agePct16t24 | agePct65up | numbUrban | pctUrban | medIncome | pctWWage | pctWFarmSelf | pctWInvInc | pctWSocSec | pctWPubAsst | pctWRetire | medFamInc | perCapInc | whitePerCap | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | NumUnderPov | PctPopUnderPov | PctLess9thGrade | PctNotHSGrad | PctBSorMore | PctUnemployed | PctEmploy | PctEmplManu | PctEmplProfServ | PctOccupManu | PctOccupMgmtProf | MalePctDivorce | MalePctNevMarr | FemalePctDiv | TotalPctDiv | PersPerFam | PctFam2Par | PctKids2Par | PctYoungKids2Par | PctTeen2Par | PctWorkMomYoungKids | PctWorkMom | NumKidsBornNeverMar | PctKidsBornNeverMar | NumImmig | PctImmigRecent | PctImmigRec5 | PctImmigRec8 | PctImmigRec10 | PctRecentImmig | PctRecImmig5 | PctRecImmig8 | PctRecImmig10 | PctSpeakEnglOnly | PctNotSpeakEnglWell | PctLargHouseFam | PctLargHouseOccup | PersPerOccupHous | PersPerOwnOccHous | PersPerRentOccHous | PctPersOwnOccup | PctPersDenseHous | PctHousLess3BR | MedNumBR | HousVacant | PctHousOccup | PctHousOwnOcc | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctHousNoPhone | PctWOFullPlumb | OwnOccLowQuart | OwnOccMedVal | OwnOccHiQuart | OwnOccQrange | RentLowQ | RentMedian | RentHighQ | RentQrange | MedRent | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | NumInShelters | NumStreet | PctForeignBorn | PctBornSameState | PctSameHouse85 | PctSameCity85 | PctSameState85 | LemasSwornFT | LemasSwFTPerPop | LemasSwFTFieldOps | LemasSwFTFieldPerPop | LemasTotalReq | LemasTotReqPerPop | PolicReqPerOffic | PolicPerPop | RacialMatchCommPol | PctPolicWhite | PctPolicBlack | PctPolicHisp | PctPolicAsian | PctPolicMinor | OfficAssgnDrugUnits | NumKindsDrugsSeiz | PolicAveOTWorked | LandArea | PopDens | PctUsePubTrans | PolicCars | PolicOperBudg | LemasPctPolicOnPatr | LemasGangUnitDeploy | LemasPctOfficDrugUn | PolicBudgPerPop | murders | murdPerPop | rapes | rapesPerPop | robberies | robbbPerPop | assaults | assaultPerPop | burglaries | burglPerPop | larcenies | larcPerPop | autoTheft | autoTheftPerPop | arsons | arsonsPerPop | ViolentCrimesPerPop | nonViolPerPop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| state | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| AK | NaN | NaN | 7.333333 | 94644.000000 | 2.756667 | 6.826667 | 77.906667 | 4.136667 | 4.086667 | 13.986667 | 29.586667 | 13.966667 | 4.303333 | 73915.000000 | 32.656667 | 41301.000000 | 89.586667 | 1.083333 | 74.923333 | 10.960000 | 6.843333 | 13.766667 | 46051.333333 | 18068.333333 | 19928.333333 | 12596.666667 | 10115.333333 | 12005.333333 | 12153.000000 | 11534.666667 | 6681.666667 | 7.683333 | 3.480000 | 11.176667 | 25.300000 | 6.610000 | 72.913333 | 3.343333 | 24.726667 | 8.743333 | 31.840000 | 12.896667 | 31.660000 | 16.116667 | 14.440000 | 3.196667 | 74.700000 | 73.003333 | 81.053333 | 78.000000 | 63.173333 | 72.633333 | 1865.333333 | 3.133333 | 5397.333333 | 12.216667 | 20.800000 | 32.660000 | 40.870000 | 0.653333 | 1.106667 | 1.730000 | 2.153333 | 90.686667 | 1.236667 | 5.390000 | 3.676667 | 2.653333 | 2.783333 | 2.503333 | 49.940000 | 5.586667 | 53.32000 | 2.333333 | 4613.000000 | 89.246667 | 47.370000 | 5.843333 | 39.796667 | 1973.666667 | 5.483333 | 1.216667 | 78433.333333 | 102866.666667 | 132533.333333 | 54100.000000 | 407.000000 | 527.333333 | 673.666667 | 266.666667 | 580.000000 | 24.900000 | 21.500000 | 11.300000 | 130.000000 | 25.000000 | 5.243333 | 28.786667 | 35.826667 | 64.576667 | 72.073333 | 266.000000 | 104.930000 | 252.000000 | 99.410000 | 107811.000000 | 42529.000000 | 405.300000 | 104.900000 | 89.390000 | 89.470000 | 4.140000 | 1.880000 | 2.630000 | 8.650000 | 16.000000 | 11.000000 | 0.000000 | 1492.733333 | 363.033333 | 2.916667 | 301.000000 | 2.941312e+07 | 94.740000 | 10.000000 | 2.006667 | 116028.100000 | 9.666667 | 8.870000 | 83.000000 | 66.080000 | 217.000000 | 156.103333 | 523.666667 | 345.516667 | 753.000000 | 639.463333 | 4325.666667 | 3813.090000 | 562.333333 | 485.286667 | 38.000000 | 23.833333 | 576.576667 | 4961.673333 |
| AL | NaN | NaN | 6.000000 | 39231.186047 | 2.611860 | 27.044651 | 71.799535 | 0.750000 | 0.665116 | 16.117209 | 28.540233 | 15.227209 | 12.415581 | 34182.069767 | 64.992558 | 26034.860465 | 75.144419 | 0.855814 | 33.825349 | 28.445814 | 8.100930 | 15.898837 | 33033.232558 | 13065.651163 | 14933.186047 | 7357.930233 | 12009.325581 | 12114.069767 | 6510.232558 | 10418.418605 | 7405.232558 | 17.863256 | 11.406977 | 27.426279 | 23.013023 | 6.635581 | 57.156047 | 19.573953 | 26.390698 | 16.170233 | 28.505814 | 9.366047 | 29.517907 | 13.030698 | 11.367674 | 3.075116 | 69.230698 | 64.306744 | 76.904186 | 70.583023 | 63.534884 | 69.728837 | 1965.325581 | 4.789070 | 687.255814 | 18.807209 | 28.404186 | 33.491860 | 42.968140 | 0.294884 | 0.434419 | 0.531163 | 0.648837 | 96.534651 | 0.424186 | 4.433023 | 3.093023 | 2.520000 | 2.629767 | 2.335349 | 65.245581 | 2.924186 | 41.71000 | 2.860465 | 1366.395349 | 92.252093 | 62.581628 | 2.883953 | 37.815349 | 1966.976744 | 6.644186 | 0.640930 | 46648.837209 | 63955.813953 | 89079.069767 | 42430.232558 | 170.465116 | 246.697674 | 325.372093 | 154.906977 | 336.627907 | 25.218605 | 18.051163 | 12.688372 | 32.093023 | 7.883721 | 1.483488 | 71.264186 | 51.849767 | 78.885349 | 87.704419 | 293.833333 | 184.121667 | 252.500000 | 155.971667 | 141162.166667 | 87517.881667 | 431.033333 | 184.150000 | 81.233333 | 75.255000 | 23.325000 | 0.425000 | 0.720000 | 24.471667 | 14.333333 | 8.500000 | 164.500000 | 37.248837 | 1143.448837 | 0.523023 | 167.166667 | 1.356148e+07 | 81.938333 | 5.833333 | 0.753953 | 82293.251667 | 7.581395 | 11.342093 | 24.325581 | 42.825349 | 135.720930 | 199.333721 | 374.093023 | 777.197674 | 708.418605 | 1273.725814 | 1869.255814 | 3730.038605 | 260.720930 | 399.103953 | 30.812500 | 35.178750 | 1030.699070 | 6025.023750 |
| AR | NaN | NaN | 5.440000 | 32674.520000 | 2.652800 | 19.470000 | 78.883600 | 0.835600 | 0.979200 | 16.207600 | 29.028800 | 15.354400 | 12.666400 | 20450.480000 | 40.000000 | 22025.560000 | 73.618400 | 1.258000 | 32.592800 | 29.241200 | 8.465200 | 14.075200 | 27477.320000 | 11245.760000 | 12798.640000 | 7215.760000 | 8606.000000 | 10218.360000 | 9461.120000 | 7651.240000 | 5535.720000 | 18.780800 | 12.641600 | 29.064400 | 16.950800 | 6.596000 | 58.104000 | 19.264000 | 24.798400 | 17.750800 | 23.689200 | 10.165600 | 27.406400 | 13.674000 | 12.065200 | 3.058000 | 69.728400 | 65.208800 | 76.107600 | 71.822800 | 64.511200 | 71.568800 | 1191.000000 | 3.995200 | 547.760000 | 19.377200 | 25.513200 | 37.754800 | 43.952000 | 0.251600 | 0.360400 | 0.540400 | 0.656000 | 96.800800 | 0.546000 | 4.402000 | 3.130800 | 2.522800 | 2.589200 | 2.421600 | 62.018800 | 3.427600 | 45.65840 | 2.880000 | 1280.440000 | 91.835600 | 60.400000 | 3.585200 | 37.975200 | 1968.120000 | 9.751200 | 0.581600 | 37220.000000 | 51488.000000 | 71760.000000 | 34540.000000 | 176.960000 | 245.360000 | 313.600000 | 136.640000 | 339.800000 | 26.880000 | 19.428000 | 13.452000 | 15.200000 | 1.440000 | 1.369200 | 63.527600 | 48.319200 | 75.919200 | 85.816000 | 111.666667 | 147.090000 | 88.000000 | 117.630000 | 51710.000000 | 73882.933333 | 447.833333 | 147.100000 | 76.176667 | 85.573333 | 14.156667 | 0.000000 | 0.270000 | 14.426667 | 9.333333 | 7.333333 | 42.933333 | 26.832000 | 1241.868000 | 0.415200 | 70.666667 | 5.969266e+06 | 78.543333 | 3.333333 | 0.979600 | 67721.166667 | 5.680000 | 10.735600 | 25.440000 | 52.083600 | 102.600000 | 195.645600 | 290.560000 | 460.085200 | 652.680000 | 1450.826400 | 1856.600000 | 4575.336800 | 222.760000 | 450.897200 | 15.760000 | 32.514800 | 718.550400 | 6509.575600 |
| AZ | NaN | NaN | 5.700000 | 125811.650000 | 2.778500 | 2.610500 | 84.655500 | 1.574500 | 22.434500 | 14.850000 | 27.722500 | 13.808500 | 11.690500 | 114430.000000 | 54.660000 | 32264.500000 | 77.323000 | 0.952000 | 38.545500 | 25.294000 | 6.444000 | 16.827500 | 37024.700000 | 15380.050000 | 16208.950000 | 14505.700000 | 10459.250000 | 12970.100000 | 7450.900000 | 9127.850000 | 17101.050000 | 15.036500 | 9.917500 | 21.542500 | 21.650500 | 7.012000 | 59.707500 | 12.751500 | 22.172500 | 11.422000 | 27.247000 | 9.869000 | 28.504500 | 13.229000 | 11.610000 | 3.175000 | 74.329500 | 70.758500 | 82.037000 | 76.480000 | 58.645500 | 66.129000 | 3050.750000 | 2.618000 | 10154.800000 | 14.418000 | 23.323000 | 30.726000 | 37.103500 | 1.459000 | 2.286000 | 2.925500 | 3.545000 | 78.144000 | 4.111000 | 7.035500 | 5.244500 | 2.706500 | 2.793000 | 2.571500 | 65.994500 | 6.886500 | 49.96900 | 2.550000 | 7948.600000 | 84.915000 | 64.188000 | 1.902000 | 27.634500 | 1975.900000 | 6.852500 | 0.329000 | 72980.000000 | 95985.000000 | 128820.050000 | 55840.050000 | 311.200000 | 405.650000 | 499.500000 | 188.300000 | 476.700000 | 28.175000 | 22.770000 | 12.635000 | 125.350000 | 30.350000 | 9.245500 | 29.936000 | 39.551000 | 68.547000 | 74.233000 | 611.833333 | 139.425000 | 575.000000 | 130.758333 | 512103.833333 | 116322.850000 | 849.000000 | 139.416667 | 96.275000 | 89.226667 | 1.680000 | 8.370000 | 0.415000 | 10.463333 | 21.500000 | 11.000000 | 161.200000 | 78.850000 | 1375.160000 | 1.103000 | 366.833333 | 4.265164e+07 | 93.966667 | 6.666667 | 1.469500 | 98273.283333 | 12.400000 | 4.613500 | 58.600000 | 30.195000 | 292.500000 | 119.256000 | 810.000000 | 397.030000 | 2281.450000 | 1510.476500 | 6869.450000 | 4325.365000 | 1501.850000 | 661.164500 | 45.100000 | 35.179000 | 551.094500 | 6532.186000 |
| CA | 76.0 | 93325.0 | 5.591398 | 79654.179211 | 2.926129 | 5.312222 | 71.973799 | 9.480896 | 24.799427 | 13.974158 | 28.252581 | 13.678136 | 9.780287 | 76204.000000 | 85.266989 | 39655.982079 | 80.120394 | 0.827849 | 40.754624 | 21.825520 | 8.877240 | 14.881900 | 44720.125448 | 17465.885305 | 18957.817204 | 13960.465950 | 14410.684588 | 15125.035842 | 10407.967742 | 11528.028674 | 9958.157706 | 10.789140 | 10.443297 | 22.795986 | 23.964158 | 6.350860 | 63.222581 | 16.795376 | 22.322186 | 12.847097 | 28.926344 | 9.677957 | 32.928100 | 14.002688 | 11.870323 | 3.304301 | 74.187312 | 70.442652 | 83.169928 | 75.711541 | 56.891039 | 64.844875 | 2630.627240 | 3.391505 | 19501.440860 | 15.708638 | 24.847025 | 34.214588 | 43.798459 | 3.295878 | 5.196667 | 7.149247 | 9.209498 | 70.336953 | 7.405412 | 9.233799 | 7.007849 | 2.851111 | 2.885735 | 2.788244 | 59.075663 | 11.628315 | 51.37086 | 2.448029 | 1725.663082 | 94.359534 | 58.164552 | 1.918746 | 20.730000 | 1967.225806 | 2.712939 | 0.429749 | 175336.935484 | 219367.068100 | 269462.820789 | 94125.885305 | 490.584229 | 607.885305 | 732.716846 | 242.132616 | 665.078853 | 28.691756 | 24.844086 | 11.708602 | 100.139785 | 49.885305 | 19.687204 | 48.018280 | 44.247742 | 74.198208 | 87.774265 | 458.061224 | 157.247959 | 386.244898 | 134.820204 | 311710.979592 | 103479.571020 | 694.083673 | 157.248980 | 82.908571 | 74.932041 | 7.714694 | 10.421837 | 3.077551 | 20.878571 | 24.877551 | 9.591837 | 181.189796 | 33.169534 | 4693.495699 | 3.265878 | 173.734694 | 3.961899e+07 | 87.499388 | 6.020408 | 1.124695 | 154858.814490 | 11.727599 | 8.134588 | 32.491039 | 32.646882 | 406.215054 | 273.137276 | 535.151079 | 496.704964 | 1098.870968 | 1248.996595 | 2694.913978 | 3022.680896 | 965.017921 | 819.774265 | 55.698925 | 47.988530 | 810.469388 | 5139.440251 |
#find the mean of crime per pop columns by state
state_crimes =crime_data.groupby("state")["murdPerPop", "rapesPerPop", "robbbPerPop", "assaultPerPop", "burglPerPop", "larcPerPop", "autoTheftPerPop", "arsonsPerPop"].mean()
state_crimes.head(10)
C:\Users\radon\AppData\Local\Temp\ipykernel_10788\76998245.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
state_crimes =crime_data.groupby("state")["murdPerPop", "rapesPerPop", "robbbPerPop", "assaultPerPop", "burglPerPop", "larcPerPop", "autoTheftPerPop", "arsonsPerPop"].mean()
| murdPerPop | rapesPerPop | robbbPerPop | assaultPerPop | burglPerPop | larcPerPop | autoTheftPerPop | arsonsPerPop | |
|---|---|---|---|---|---|---|---|---|
| state | ||||||||
| AK | 8.870000 | 66.080000 | 156.103333 | 345.516667 | 639.463333 | 3813.090000 | 485.286667 | 23.833333 |
| AL | 11.342093 | 42.825349 | 199.333721 | 777.197674 | 1273.725814 | 3730.038605 | 399.103953 | 35.178750 |
| AR | 10.735600 | 52.083600 | 195.645600 | 460.085200 | 1450.826400 | 4575.336800 | 450.897200 | 32.514800 |
| AZ | 4.613500 | 30.195000 | 119.256000 | 397.030000 | 1510.476500 | 4325.365000 | 661.164500 | 35.179000 |
| CA | 8.134588 | 32.646882 | 273.137276 | 496.704964 | 1248.996595 | 3022.680896 | 819.774265 | 47.988530 |
| CO | 3.822800 | 41.320400 | 84.554000 | 397.163600 | 965.632800 | 4010.375600 | 350.701200 | 42.820400 |
| CT | 3.841972 | 16.033239 | 111.564366 | 127.755217 | 740.837746 | 2372.257465 | 425.045070 | 19.466338 |
| DC | 81.950000 | 58.480000 | 1282.850000 | 1625.090000 | 2081.590000 | 5679.780000 | 1454.870000 | 36.100000 |
| DE | 0.000000 | 123.330000 | 267.210000 | 496.750000 | 859.880000 | 6118.530000 | 260.360000 | 37.680000 |
| FL | 7.305556 | 49.775889 | 359.934778 | 742.029556 | 1932.676667 | 5190.722333 | 866.247889 | 21.462111 |
#find the states with min and max values for crimes
state_crimes["murdPerPop"].sort_values()
state_crimes["rapesPerPop"].sort_values()
state_crimes["robbbPerPop"].sort_values()
state_crimes["assaultPerPop"].sort_values()
state_crimes["burglPerPop"].sort_values()
state_crimes["larcPerPop"].sort_values()
state_crimes["autoTheftPerPop"].sort_values()
state_crimes["arsonsPerPop"].sort_values()
#find representation of each state in dataset, how many rows does each state have?
crime_data.groupby(['state'])['state'].count().sort_values()
#Examine the STATE with the overall max values of each crime
crime_data.loc[crime_data['state'] == 'DC']
| communityname | state | countyCode | communityCode | fold | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct12t29 | agePct16t24 | agePct65up | numbUrban | pctUrban | medIncome | pctWWage | pctWFarmSelf | pctWInvInc | pctWSocSec | pctWPubAsst | pctWRetire | medFamInc | perCapInc | whitePerCap | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | NumUnderPov | PctPopUnderPov | PctLess9thGrade | PctNotHSGrad | PctBSorMore | PctUnemployed | PctEmploy | PctEmplManu | PctEmplProfServ | PctOccupManu | PctOccupMgmtProf | MalePctDivorce | MalePctNevMarr | FemalePctDiv | TotalPctDiv | PersPerFam | PctFam2Par | PctKids2Par | PctYoungKids2Par | PctTeen2Par | PctWorkMomYoungKids | PctWorkMom | NumKidsBornNeverMar | PctKidsBornNeverMar | NumImmig | PctImmigRecent | PctImmigRec5 | PctImmigRec8 | PctImmigRec10 | PctRecentImmig | PctRecImmig5 | PctRecImmig8 | PctRecImmig10 | PctSpeakEnglOnly | PctNotSpeakEnglWell | PctLargHouseFam | PctLargHouseOccup | PersPerOccupHous | PersPerOwnOccHous | PersPerRentOccHous | PctPersOwnOccup | PctPersDenseHous | PctHousLess3BR | MedNumBR | HousVacant | PctHousOccup | PctHousOwnOcc | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctHousNoPhone | PctWOFullPlumb | OwnOccLowQuart | OwnOccMedVal | OwnOccHiQuart | OwnOccQrange | RentLowQ | RentMedian | RentHighQ | RentQrange | MedRent | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | NumInShelters | NumStreet | PctForeignBorn | PctBornSameState | PctSameHouse85 | PctSameCity85 | PctSameState85 | LemasSwornFT | LemasSwFTPerPop | LemasSwFTFieldOps | LemasSwFTFieldPerPop | LemasTotalReq | LemasTotReqPerPop | PolicReqPerOffic | PolicPerPop | RacialMatchCommPol | PctPolicWhite | PctPolicBlack | PctPolicHisp | PctPolicAsian | PctPolicMinor | OfficAssgnDrugUnits | NumKindsDrugsSeiz | PolicAveOTWorked | LandArea | PopDens | PctUsePubTrans | PolicCars | PolicOperBudg | LemasPctPolicOnPatr | LemasGangUnitDeploy | LemasPctOfficDrugUn | PolicBudgPerPop | murders | murdPerPop | rapes | rapesPerPop | robberies | robbbPerPop | assaults | assaultPerPop | burglaries | burglPerPop | larcenies | larcPerPop | autoTheft | autoTheftPerPop | arsons | arsonsPerPop | ViolentCrimesPerPop | nonViolPerPop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1581 | Washingtoncity | DC | 1.0 | 50000.0 | 8 | 606900 | 2.43 | 65.84 | 29.6 | 1.85 | 5.39 | 13.54 | 29.83 | 15.69 | 11.53 | 606900 | 100.0 | 30727 | 78.16 | 0.5 | 34.26 | 20.22 | 8.94 | 17.44 | 36256 | 18881 | 34563 | 12226 | 14095 | 16498 | 10468.0 | 12525 | 96278 | 16.87 | 9.56 | 26.85 | 33.31 | 7.03 | 61.62 | 4.29 | 33.61 | 7.37 | 39.1 | 13.67 | 50.76 | 16.18 | 15.03 | 3.15 | 41.96 | 33.58 | 52.23 | 44.32 | 66.58 | 72.92 | 70523 | 18.2 | 58887 | 23.98 | 36.96 | 48.35 | 57.55 | 2.33 | 3.59 | 4.69 | 5.58 | 87.49 | 2.55 | 8.96 | 4.6 | 2.26 | 2.5 | 2.12 | 42.91 | 8.25 | 67.33 | 2 | 28855 | 89.64 | 38.9 | 17.4 | 44.05 | 1947 | 4.22 | 0.81 | 86700 | 123900 | 258700 | 172000 | 317 | 441 | 618 | 301 | 479 | 25.4 | 20.5 | 12.8 | 4682 | 131 | 9.7 | 39.34 | 53.49 | 76.61 | 76.61 | 4506.0 | 813.36 | 4066.0 | 733.94 | 871531.0 | 157316.09 | 193.4 | 813.4 | 97.32 | 32.2 | 64.4 | 2.57 | 0.75 | 65.42 | 198.0 | 13.0 | 197.9 | 63.6 | 9538.9 | 37.3 | 590.0 | 208184992.0 | 90.24 | 0.0 | 4.39 | 375785.19 | 454 | 81.95 | 324.0 | 58.48 | 7107.0 | 1282.85 | 9003.0 | 1625.09 | 11532.0 | 2081.59 | 31466.0 | 5679.78 | 8060.0 | 1454.87 | 200.0 | 36.1 | 3048.38 | 9252.35 |
#Sort the dataset by population
crime_data.sort_values(['population'])
#Subset the states with the most representation in the dataset
max_rep = ['CA', 'NJ', 'TX']
maxrep_subset = crime_data[crime_data.state.isin(max_rep)]
maxrep_subset.describe()
| countyCode | communityCode | fold | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct12t29 | agePct16t24 | agePct65up | numbUrban | pctUrban | medIncome | pctWWage | pctWFarmSelf | pctWInvInc | pctWSocSec | pctWPubAsst | pctWRetire | medFamInc | perCapInc | whitePerCap | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | NumUnderPov | PctPopUnderPov | PctLess9thGrade | PctNotHSGrad | PctBSorMore | PctUnemployed | PctEmploy | PctEmplManu | PctEmplProfServ | PctOccupManu | PctOccupMgmtProf | MalePctDivorce | MalePctNevMarr | FemalePctDiv | TotalPctDiv | PersPerFam | PctFam2Par | PctKids2Par | PctYoungKids2Par | PctTeen2Par | PctWorkMomYoungKids | PctWorkMom | NumKidsBornNeverMar | PctKidsBornNeverMar | NumImmig | PctImmigRecent | PctImmigRec5 | PctImmigRec8 | PctImmigRec10 | PctRecentImmig | PctRecImmig5 | PctRecImmig8 | PctRecImmig10 | PctSpeakEnglOnly | PctNotSpeakEnglWell | PctLargHouseFam | PctLargHouseOccup | PersPerOccupHous | PersPerOwnOccHous | PersPerRentOccHous | PctPersOwnOccup | PctPersDenseHous | PctHousLess3BR | MedNumBR | HousVacant | PctHousOccup | PctHousOwnOcc | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctHousNoPhone | PctWOFullPlumb | OwnOccLowQuart | OwnOccMedVal | OwnOccHiQuart | OwnOccQrange | RentLowQ | RentMedian | RentHighQ | RentQrange | MedRent | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | NumInShelters | NumStreet | PctForeignBorn | PctBornSameState | PctSameHouse85 | PctSameCity85 | PctSameState85 | LemasSwornFT | LemasSwFTPerPop | LemasSwFTFieldOps | LemasSwFTFieldPerPop | LemasTotalReq | LemasTotReqPerPop | PolicReqPerOffic | PolicPerPop | RacialMatchCommPol | PctPolicWhite | PctPolicBlack | PctPolicHisp | PctPolicAsian | PctPolicMinor | OfficAssgnDrugUnits | NumKindsDrugsSeiz | PolicAveOTWorked | LandArea | PopDens | PctUsePubTrans | PolicCars | PolicOperBudg | LemasPctPolicOnPatr | LemasGangUnitDeploy | LemasPctOfficDrugUn | PolicBudgPerPop | murders | murdPerPop | rapes | rapesPerPop | robberies | robbbPerPop | assaults | assaultPerPop | burglaries | burglPerPop | larcenies | larcPerPop | autoTheft | autoTheftPerPop | arsons | arsonsPerPop | ViolentCrimesPerPop | nonViolPerPop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 213.000000 | 213.000000 | 652.000000 | 6.520000e+02 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 6.520000e+02 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 6.520000e+02 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.00000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.00000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 652.000000 | 110.000000 | 110.000000 | 110.000000 | 110.000000 | 1.100000e+02 | 110.000000 | 110.000000 | 110.000000 | 110.000000 | 110.000000 | 110.000000 | 110.000000 | 110.000000 | 110.000000 | 110.000000 | 110.000000 | 110.000000 | 652.000000 | 652.000000 | 652.000000 | 110.000000 | 1.100000e+02 | 110.000000 | 110.000000 | 652.000000 | 110.000000 | 652.000000 | 652.000000 | 651.000000 | 651.000000 | 651.000000 | 651.000000 | 646.000000 | 646.000000 | 649.000000 | 649.000000 | 650.000000 | 650.000000 | 652.000000 | 652.000000 | 651.000000 | 651.000000 | 645.000000 | 647.000000 |
| mean | 19.215962 | 45170.629108 | 5.624233 | 5.946048e+04 | 2.855675 | 7.809647 | 77.526794 | 5.590368 | 18.911733 | 14.076104 | 27.651979 | 13.474862 | 10.642561 | 5.536025e+04 | 78.748129 | 38775.401840 | 80.323574 | 0.947699 | 42.681534 | 23.992347 | 7.033880 | 14.907301 | 44149.286810 | 17090.036810 | 18324.825153 | 13762.685583 | 14571.973926 | 15354.412577 | 10750.357362 | 11761.555215 | 7998.104294 | 10.885000 | 10.636656 | 23.483390 | 23.986012 | 6.171641 | 63.020828 | 16.390000 | 23.150000 | 12.702209 | 29.008221 | 8.804233 | 31.155844 | 12.393206 | 10.657147 | 3.253972 | 76.047853 | 72.412132 | 84.666319 | 77.069647 | 57.116626 | 65.847040 | 1931.239264 | 3.038865 | 1.137832e+04 | 13.715798 | 21.905031 | 30.579509 | 39.761610 | 2.179018 | 3.444632 | 4.747853 | 6.143037 | 75.763282 | 5.302960 | 7.452163 | 5.606672 | 2.780997 | 2.869233 | 2.612193 | 64.439525 | 7.83089 | 46.804586 | 2.579755 | 1797.153374 | 92.747883 | 62.434816 | 2.833482 | 32.559325 | 1964.822086 | 4.017209 | 0.500521 | 130855.375767 | 165164.895706 | 207417.072086 | 76561.696319 | 422.260736 | 535.31135 | 652.104294 | 229.843558 | 605.185583 | 27.204141 | 23.227301 | 13.182669 | 64.351227 | 25.808282 | 14.050905 | 54.592086 | 50.449969 | 76.195046 | 88.750890 | 386.645455 | 187.492182 | 328.300000 | 162.856455 | 2.316315e+05 | 94973.719909 | 548.017273 | 187.495455 | 83.070182 | 77.613000 | 7.260818 | 11.264182 | 1.458364 | 19.696182 | 19.381818 | 9.009091 | 131.661818 | 28.132362 | 3909.484509 | 4.185245 | 158.345455 | 2.768150e+07 | 87.694909 | 4.545455 | 0.994172 | 133338.572636 | 8.174847 | 6.437638 | 27.511521 | 32.601644 | 246.944700 | 200.858464 | 339.063467 | 395.167276 | 833.169492 | 1087.383436 | 2220.513846 | 3027.076031 | 652.769939 | 626.186319 | 38.989247 | 37.594240 | 633.092155 | 4771.453292 |
| std | 13.235541 | 23379.476968 | 2.891520 | 1.761253e+05 | 0.408726 | 11.403043 | 16.836132 | 7.078681 | 21.017267 | 3.468795 | 5.207060 | 4.374458 | 4.816070 | 1.771707e+05 | 39.884760 | 14992.840835 | 7.663542 | 0.720604 | 14.193613 | 8.329159 | 5.210186 | 4.670639 | 16305.422483 | 7518.350487 | 7664.944504 | 11781.611890 | 21453.858461 | 8306.152573 | 7484.655939 | 6152.004224 | 32504.649723 | 8.717475 | 9.042146 | 13.260618 | 13.906079 | 3.106506 | 8.291967 | 6.963107 | 6.493035 | 6.571939 | 10.689668 | 2.665646 | 7.469032 | 3.272733 | 2.920009 | 0.306269 | 9.115853 | 10.484327 | 10.127028 | 9.013849 | 7.681386 | 6.533787 | 9642.018433 | 2.968143 | 5.711001e+04 | 6.508180 | 9.211187 | 11.316008 | 13.783983 | 2.194139 | 3.377931 | 4.494551 | 5.832892 | 18.723682 | 6.274762 | 5.279596 | 4.428548 | 0.386238 | 0.347950 | 0.500160 | 15.662616 | 8.60621 | 16.396064 | 0.541450 | 6516.092183 | 5.937758 | 15.509878 | 3.600391 | 15.338566 | 10.768572 | 4.408227 | 0.548299 | 86051.081556 | 103692.962681 | 122233.431468 | 47804.118771 | 161.275321 | 188.10053 | 212.395831 | 86.642121 | 183.577918 | 2.978437 | 2.810971 | 1.903225 | 304.640980 | 172.904158 | 10.888643 | 12.661013 | 11.309996 | 10.228695 | 5.937045 | 920.540095 | 89.599880 | 825.405594 | 71.528482 | 5.986471e+05 | 56905.574568 | 320.126315 | 89.602001 | 12.075083 | 18.477372 | 10.003297 | 15.940309 | 2.661775 | 18.440615 | 42.284528 | 2.780982 | 100.500720 | 145.747217 | 4050.534161 | 5.706809 | 328.035477 | 5.733176e+07 | 9.596803 | 3.922732 | 2.635903 | 71346.126995 | 50.019629 | 8.563770 | 102.932993 | 29.623423 | 1633.680281 | 271.942800 | 1806.484322 | 362.778347 | 2758.257846 | 626.537587 | 7014.427370 | 1673.329990 | 2990.114617 | 605.970864 | 226.700972 | 43.713629 | 590.534673 | 2451.570076 |
| min | 1.000000 | 70.000000 | 1.000000 | 1.002300e+04 | 1.600000 | 0.000000 | 7.210000 | 0.060000 | 0.770000 | 4.580000 | 10.550000 | 4.930000 | 1.660000 | 0.000000e+00 | 0.000000 | 12908.000000 | 31.680000 | 0.000000 | 9.020000 | 4.810000 | 0.640000 | 3.460000 | 14257.000000 | 5237.000000 | 5472.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3188.000000 | 78.000000 | 0.730000 | 0.620000 | 2.520000 | 1.630000 | 1.610000 | 24.820000 | 3.060000 | 8.690000 | 1.370000 | 6.480000 | 2.350000 | 13.300000 | 4.490000 | 3.640000 | 2.500000 | 32.240000 | 26.110000 | 27.430000 | 30.640000 | 33.730000 | 44.650000 | 0.000000 | 0.000000 | 5.600000e+01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.150000 | 0.000000 | 1.560000 | 0.480000 | 1.580000 | 1.610000 | 1.580000 | 13.930000 | 0.18000 | 3.060000 | 1.000000 | 36.000000 | 37.470000 | 16.860000 | 0.000000 | 3.120000 | 1939.000000 | 0.000000 | 0.000000 | 15700.000000 | 27200.000000 | 42200.000000 | 0.000000 | 99.000000 | 144.00000 | 226.000000 | 0.000000 | 192.000000 | 14.900000 | 15.300000 | 10.100000 | 0.000000 | 0.000000 | 0.510000 | 20.210000 | 11.830000 | 27.950000 | 49.800000 | 70.000000 | 47.030000 | 69.000000 | 25.280000 | 1.415500e+04 | 15370.900000 | 72.400000 | 47.000000 | 42.310000 | 1.600000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.900000 | 16.300000 | 0.000000 | 20.000000 | 4.288914e+06 | 35.010000 | 0.000000 | 0.000000 | 37041.600000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 3.230000 | 6.000000 | 49.170000 | 75.000000 | 396.560000 | 1.000000 | 7.210000 | 0.000000 | 0.000000 | 15.030000 | 696.610000 |
| 25% | 7.000000 | 25770.000000 | 3.000000 | 1.559650e+04 | 2.610000 | 1.275000 | 68.767500 | 1.237500 | 4.420000 | 12.057500 | 24.670000 | 11.310000 | 7.480000 | 1.098100e+04 | 93.325000 | 27160.250000 | 76.090000 | 0.507500 | 31.957500 | 18.327500 | 3.020000 | 11.850000 | 31316.250000 | 11933.250000 | 13198.250000 | 8311.250000 | 8240.250000 | 10678.250000 | 6863.750000 | 7624.250000 | 918.250000 | 4.085000 | 4.537500 | 13.067500 | 13.527500 | 3.877500 | 58.337500 | 11.707500 | 19.095000 | 8.050000 | 21.442500 | 6.897500 | 26.310000 | 10.047500 | 8.637500 | 3.070000 | 70.917500 | 65.840000 | 78.687500 | 72.522500 | 51.855000 | 62.030000 | 161.750000 | 1.120000 | 1.200750e+03 | 9.107500 | 15.642500 | 23.212500 | 30.422500 | 0.620000 | 1.027500 | 1.437500 | 1.955000 | 69.727500 | 1.317500 | 4.137500 | 3.000000 | 2.540000 | 2.657500 | 2.247500 | 54.685000 | 2.32000 | 36.037500 | 2.000000 | 332.750000 | 90.530000 | 53.177500 | 0.780000 | 19.875000 | 1957.000000 | 0.730000 | 0.180000 | 63825.000000 | 81975.000000 | 105050.000000 | 41600.000000 | 302.500000 | 387.00000 | 479.000000 | 162.000000 | 455.750000 | 24.975000 | 21.475000 | 11.700000 | 0.000000 | 0.000000 | 5.742500 | 45.632500 | 42.310000 | 70.985000 | 86.175000 | 126.000000 | 125.192500 | 107.250000 | 112.335000 | 5.000000e+04 | 56083.725000 | 341.925000 | 125.200000 | 76.587500 | 70.150000 | 1.602500 | 2.862500 | 0.000000 | 7.937500 | 6.000000 | 7.000000 | 60.725000 | 5.700000 | 1501.675000 | 0.577500 | 52.000000 | 7.804816e+06 | 85.490000 | 0.000000 | 0.000000 | 91056.775000 | 0.000000 | 0.000000 | 3.000000 | 11.870000 | 9.000000 | 44.015000 | 28.000000 | 128.790000 | 130.000000 | 640.120000 | 390.250000 | 1799.760000 | 41.000000 | 217.457500 | 2.000000 | 11.165000 | 231.480000 | 2941.910000 |
| 50% | 21.000000 | 45990.000000 | 6.000000 | 2.676400e+04 | 2.795000 | 3.425000 | 81.355000 | 3.320000 | 10.785000 | 13.765000 | 27.245000 | 12.605000 | 10.210000 | 2.352500e+04 | 100.000000 | 36308.000000 | 80.370000 | 0.730000 | 42.265000 | 23.465000 | 5.475000 | 14.290000 | 41761.000000 | 15627.000000 | 16674.500000 | 12222.500000 | 12238.000000 | 14159.000000 | 9148.500000 | 10264.000000 | 2221.500000 | 7.805000 | 7.935000 | 21.550000 | 20.365000 | 5.380000 | 63.310000 | 15.795000 | 22.375000 | 12.075000 | 26.965000 | 8.805000 | 29.865000 | 12.605000 | 10.715000 | 3.200000 | 76.060000 | 72.680000 | 85.605000 | 77.460000 | 57.230000 | 66.560000 | 432.500000 | 2.210000 | 3.057500e+03 | 13.610000 | 21.710000 | 30.735000 | 40.180000 | 1.415000 | 2.320000 | 3.315000 | 4.205000 | 81.475000 | 2.835000 | 5.595000 | 4.130000 | 2.730000 | 2.855000 | 2.540000 | 64.350000 | 5.03500 | 47.845000 | 3.000000 | 612.000000 | 94.685000 | 62.340000 | 1.780000 | 30.815000 | 1966.000000 | 2.250000 | 0.350000 | 123500.000000 | 153200.000000 | 187550.000000 | 61600.000000 | 428.500000 | 544.50000 | 659.500000 | 215.000000 | 613.000000 | 27.300000 | 23.400000 | 12.700000 | 0.000000 | 0.000000 | 11.160000 | 53.800000 | 50.220000 | 78.245000 | 89.890000 | 161.000000 | 161.125000 | 140.000000 | 138.020000 | 7.524050e+04 | 80827.600000 | 450.950000 | 161.150000 | 85.965000 | 82.620000 | 3.345000 | 7.430000 | 0.095000 | 14.920000 | 9.000000 | 9.000000 | 110.600000 | 11.600000 | 2739.000000 | 2.005000 | 71.000000 | 1.194086e+07 | 89.910000 | 5.000000 | 0.000000 | 118573.900000 | 1.000000 | 3.995000 | 8.000000 | 24.610000 | 32.000000 | 106.070000 | 87.000000 | 277.020000 | 288.000000 | 983.410000 | 792.000000 | 2747.070000 | 131.500000 | 433.590000 | 8.000000 | 25.220000 | 461.980000 | 4520.890000 |
| 75% | 27.000000 | 63360.000000 | 8.000000 | 5.233275e+04 | 3.040000 | 8.917500 | 90.522500 | 6.635000 | 23.905000 | 15.480000 | 29.767500 | 14.475000 | 13.220000 | 5.198125e+04 | 100.000000 | 46892.500000 | 85.520000 | 1.162500 | 53.237500 | 29.192500 | 9.740000 | 17.380000 | 53311.250000 | 20130.000000 | 21471.000000 | 16412.500000 | 16644.500000 | 18218.750000 | 12565.750000 | 14357.500000 | 5541.500000 | 16.092500 | 13.322500 | 30.955000 | 31.772500 | 7.840000 | 68.592500 | 20.272500 | 26.230000 | 15.917500 | 35.010000 | 10.650000 | 34.792500 | 14.675000 | 12.745000 | 3.370000 | 82.675000 | 80.325000 | 92.537500 | 83.150000 | 62.312500 | 70.330000 | 1200.750000 | 3.782500 | 7.961000e+03 | 17.505000 | 27.945000 | 38.275000 | 49.940000 | 3.172500 | 4.907500 | 6.905000 | 8.855000 | 88.545000 | 6.542500 | 8.932500 | 6.357500 | 2.972500 | 3.020000 | 2.880000 | 75.432500 | 9.87000 | 57.645000 | 3.000000 | 1284.500000 | 96.307500 | 72.920000 | 3.592500 | 43.257500 | 1973.000000 | 5.892500 | 0.620000 | 175700.000000 | 213775.000000 | 275850.000000 | 97925.000000 | 535.000000 | 655.00000 | 814.500000 | 282.250000 | 728.000000 | 29.100000 | 25.000000 | 14.200000 | 25.250000 | 2.000000 | 19.002500 | 63.002500 | 58.712500 | 83.455000 | 92.572500 | 237.500000 | 234.467500 | 207.000000 | 202.740000 | 1.575000e+05 | 118636.875000 | 687.550000 | 234.475000 | 92.090000 | 89.612500 | 9.262500 | 12.350000 | 1.985000 | 24.750000 | 19.000000 | 11.000000 | 161.750000 | 23.900000 | 4966.375000 | 5.677500 | 114.750000 | 1.813866e+07 | 93.045000 | 10.000000 | 0.000000 | 158150.800000 | 4.000000 | 8.850000 | 19.000000 | 45.155000 | 116.500000 | 238.850000 | 232.750000 | 580.935000 | 665.000000 | 1415.030000 | 1773.500000 | 3862.060000 | 409.500000 | 857.282500 | 21.000000 | 48.000000 | 833.290000 | 5927.930000 |
| max | 81.000000 | 93480.000000 | 10.000000 | 3.485398e+06 | 5.280000 | 89.950000 | 99.170000 | 57.460000 | 95.290000 | 43.380000 | 66.640000 | 55.320000 | 52.770000 | 3.485398e+06 | 100.000000 | 123625.000000 | 96.760000 | 6.320000 | 79.430000 | 76.390000 | 26.920000 | 45.510000 | 131315.000000 | 63302.000000 | 68850.000000 | 212120.000000 | 480000.000000 | 82133.000000 | 125526.000000 | 51320.000000 | 643809.000000 | 48.820000 | 49.890000 | 73.660000 | 73.630000 | 23.830000 | 83.810000 | 41.690000 | 62.670000 | 44.270000 | 64.970000 | 19.090000 | 71.260000 | 23.460000 | 19.100000 | 4.640000 | 93.600000 | 92.580000 | 100.000000 | 97.340000 | 78.870000 | 83.960000 | 212238.000000 | 24.190000 | 1.336665e+06 | 38.520000 | 54.190000 | 69.760000 | 78.740000 | 13.710000 | 19.930000 | 25.340000 | 32.630000 | 97.330000 | 38.330000 | 34.870000 | 30.870000 | 4.520000 | 4.480000 | 4.730000 | 96.140000 | 59.49000 | 95.340000 | 4.000000 | 109558.000000 | 99.000000 | 96.220000 | 39.890000 | 78.590000 | 1987.000000 | 23.630000 | 5.330000 | 500001.000000 | 500001.000000 | 500001.000000 | 267700.000000 | 1001.000000 | 1001.00000 | 1001.000000 | 675.000000 | 1001.000000 | 35.100000 | 32.700000 | 23.400000 | 4597.000000 | 3109.000000 | 60.400000 | 92.040000 | 76.440000 | 95.850000 | 99.900000 | 8295.000000 | 623.660000 | 7683.000000 | 383.830000 | 5.480855e+06 | 316432.500000 | 2156.500000 | 623.700000 | 100.000000 | 100.000000 | 59.520000 | 98.400000 | 18.570000 | 98.400000 | 406.000000 | 14.000000 | 439.100000 | 3569.800000 | 44229.900000 | 52.590000 | 2482.000000 | 4.880505e+08 | 99.410000 | 10.000000 | 18.420000 | 546841.190000 | 1076.000000 | 63.940000 | 1773.000000 | 170.670000 | 38415.000000 | 2264.130000 | 42437.000000 | 2191.730000 | 50232.000000 | 4848.970000 | 119092.000000 | 22164.780000 | 59764.000000 | 4968.590000 | 5119.000000 | 377.610000 | 3928.030000 | 27010.770000 |
#Subset the states with the least representation
min_rep = ['KS', 'DE', 'DC', 'AK', 'VT']
minrep_subset = crime_data[crime_data.state.isin(min_rep)]
minrep_subset.describe()
| countyCode | communityCode | fold | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct12t29 | agePct16t24 | agePct65up | numbUrban | pctUrban | medIncome | pctWWage | pctWFarmSelf | pctWInvInc | pctWSocSec | pctWPubAsst | pctWRetire | medFamInc | perCapInc | whitePerCap | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | NumUnderPov | PctPopUnderPov | PctLess9thGrade | PctNotHSGrad | PctBSorMore | PctUnemployed | PctEmploy | PctEmplManu | PctEmplProfServ | PctOccupManu | PctOccupMgmtProf | MalePctDivorce | MalePctNevMarr | FemalePctDiv | TotalPctDiv | PersPerFam | PctFam2Par | PctKids2Par | PctYoungKids2Par | PctTeen2Par | PctWorkMomYoungKids | PctWorkMom | NumKidsBornNeverMar | PctKidsBornNeverMar | NumImmig | PctImmigRecent | PctImmigRec5 | PctImmigRec8 | PctImmigRec10 | PctRecentImmig | PctRecImmig5 | PctRecImmig8 | PctRecImmig10 | PctSpeakEnglOnly | PctNotSpeakEnglWell | PctLargHouseFam | PctLargHouseOccup | PersPerOccupHous | PersPerOwnOccHous | PersPerRentOccHous | PctPersOwnOccup | PctPersDenseHous | PctHousLess3BR | MedNumBR | HousVacant | PctHousOccup | PctHousOwnOcc | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctHousNoPhone | PctWOFullPlumb | OwnOccLowQuart | OwnOccMedVal | OwnOccHiQuart | OwnOccQrange | RentLowQ | RentMedian | RentHighQ | RentQrange | MedRent | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | NumInShelters | NumStreet | PctForeignBorn | PctBornSameState | PctSameHouse85 | PctSameCity85 | PctSameState85 | LemasSwornFT | LemasSwFTPerPop | LemasSwFTFieldOps | LemasSwFTFieldPerPop | LemasTotalReq | LemasTotReqPerPop | PolicReqPerOffic | PolicPerPop | RacialMatchCommPol | PctPolicWhite | PctPolicBlack | PctPolicHisp | PctPolicAsian | PctPolicMinor | OfficAssgnDrugUnits | NumKindsDrugsSeiz | PolicAveOTWorked | LandArea | PopDens | PctUsePubTrans | PolicCars | PolicOperBudg | LemasPctPolicOnPatr | LemasGangUnitDeploy | LemasPctOfficDrugUn | PolicBudgPerPop | murders | murdPerPop | rapes | rapesPerPop | robberies | robbbPerPop | assaults | assaultPerPop | burglaries | burglPerPop | larcenies | larcPerPop | autoTheft | autoTheftPerPop | arsons | arsonsPerPop | ViolentCrimesPerPop | nonViolPerPop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 6.000000 | 6.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.00000 | 10.000000 | 10.00000 | 10.00000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.00000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.00000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.00000 | 10.000000 | 10.000000 | 10.00000 | 10.00000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.00000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.00000 | 10.00000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.00000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 4.00000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 4.00000 | 4.000000 | 4.000000 | 4.00000 | 4.000000 | 10.000000 | 10.00000 | 10.000000 | 4.000000 | 4.000000e+00 | 4.000000 | 4.000000 | 10.000000 | 4.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 5.000000 | 5.000000 | 10.000000 | 5.000000 |
| mean | 36.000000 | 46991.666667 | 5.600000 | 130737.000000 | 2.644000 | 13.102000 | 80.219000 | 2.356000 | 2.900000 | 15.926000 | 31.567000 | 17.090000 | 8.383000 | 122225.900000 | 66.679000 | 34658.500000 | 83.24900 | 0.890000 | 53.73300 | 19.05700 | 6.652000 | 14.514000 | 40603.800000 | 16394.200000 | 18934.400000 | 11684.700000 | 10802.300000 | 11941.500000 | 7550.700000 | 11422.800000 | 16641.300000 | 10.43400 | 5.671000 | 15.547000 | 27.992000 | 5.550000 | 67.669000 | 10.500000 | 27.004000 | 9.710000 | 31.997000 | 11.131000 | 37.087000 | 14.195000 | 12.698000 | 3.070000 | 69.078000 | 66.923000 | 78.012000 | 71.999000 | 64.057000 | 72.826000 | 8707.900000 | 4.84600 | 9168.900000 | 16.070000 | 23.136000 | 32.390000 | 38.655000 | 0.826000 | 1.221000 | 1.694000 | 2.029000 | 91.849000 | 0.999000 | 4.577000 | 2.926000 | 2.478000 | 2.64700 | 2.272000 | 56.559000 | 3.66400 | 53.05900 | 2.500000 | 5759.800000 | 91.135000 | 52.967000 | 5.460000 | 37.11200 | 1962.100000 | 4.333000 | 0.703000 | 79560.000000 | 102790.000000 | 142730.000000 | 63170.000000 | 353.300000 | 458.800000 | 576.300000 | 223.000000 | 523.100000 | 25.580000 | 21.270000 | 12.57000 | 540.20000 | 21.400000 | 4.892000 | 45.596000 | 42.259000 | 72.017000 | 78.10500 | 1321.500000 | 321.770000 | 1195.500000 | 292.042500 | 342453.750000 | 103870.247500 | 467.72500 | 321.775000 | 93.342500 | 78.467500 | 18.317500 | 1.822500 | 0.84500 | 20.410000 | 57.250000 | 10.00000 | 110.825000 | 475.750000 | 2144.41000 | 5.266000 | 273.000000 | 6.771188e+07 | 91.565000 | 3.750000 | 1.568000 | 183933.272500 | 53.300000 | 13.211000 | 91.500000 | 62.496000 | 917.700000 | 250.089000 | 1187.900000 | 374.653000 | 2093.700000 | 1109.321000 | 6643.600000 | 4542.293000 | 1284.500000 | 463.628000 | 65.000000 | 29.056000 | 700.452000 | 6282.766000 |
| std | 67.441827 | 28123.129567 | 2.458545 | 195996.657595 | 0.183073 | 20.853535 | 21.288761 | 1.411486 | 1.969202 | 4.659438 | 6.288545 | 6.490518 | 3.757798 | 200855.131413 | 46.991043 | 8091.644803 | 5.91336 | 0.282921 | 17.13674 | 7.47369 | 2.777384 | 3.086009 | 7881.706186 | 2864.852131 | 6263.841867 | 4743.836166 | 3944.801165 | 3917.865894 | 4963.835637 | 3793.149462 | 30194.074621 | 5.25849 | 2.456413 | 6.050423 | 6.726027 | 1.799451 | 5.265294 | 7.166563 | 4.380231 | 2.491113 | 5.597257 | 2.143805 | 9.211694 | 2.128407 | 2.070452 | 0.110454 | 10.550211 | 13.091965 | 10.966617 | 10.505742 | 3.713286 | 4.229474 | 21871.270031 | 4.93539 | 18148.978878 | 7.175449 | 8.711964 | 9.665053 | 12.176344 | 0.594516 | 0.907738 | 1.151831 | 1.383598 | 2.375563 | 0.713621 | 1.920151 | 0.990311 | 0.159081 | 0.12979 | 0.180358 | 13.593108 | 2.54648 | 7.30352 | 0.527046 | 9292.176049 | 3.579349 | 13.167262 | 4.960423 | 9.26628 | 13.763478 | 2.358983 | 0.677004 | 16780.623747 | 21183.140571 | 47476.030209 | 39215.559554 | 73.063215 | 89.197409 | 110.048525 | 54.108944 | 86.992273 | 2.081559 | 1.263197 | 1.22479 | 1458.76613 | 41.414437 | 1.934378 | 14.947427 | 7.373137 | 9.247546 | 8.72826 | 2127.326413 | 332.327943 | 1917.553041 | 298.699678 | 376930.265812 | 47184.416303 | 263.97143 | 332.352317 | 4.302126 | 31.166043 | 30.793664 | 1.280478 | 1.24141 | 30.252446 | 94.026149 | 2.94392 | 85.098938 | 946.994925 | 2844.69981 | 11.358544 | 238.753709 | 9.423337e+07 | 2.175416 | 4.787136 | 2.240812 | 131168.014654 | 141.648273 | 25.019023 | 124.392971 | 31.185403 | 2215.698036 | 391.183905 | 2792.666167 | 484.606807 | 3763.327902 | 545.433585 | 10214.814233 | 1273.663253 | 2555.521397 | 447.962126 | 86.709284 | 14.479887 | 902.569245 | 2102.843563 |
| min | 1.000000 | 10675.000000 | 2.000000 | 12809.000000 | 2.420000 | 0.390000 | 29.600000 | 0.460000 | 0.450000 | 11.720000 | 24.660000 | 10.530000 | 3.480000 | 0.000000 | 0.000000 | 25434.000000 | 72.91000 | 0.420000 | 34.26000 | 9.30000 | 1.140000 | 10.970000 | 31083.000000 | 12984.000000 | 13023.000000 | 6507.000000 | 5974.000000 | 6598.000000 | 0.000000 | 7964.000000 | 453.000000 | 3.62000 | 2.810000 | 8.970000 | 18.340000 | 2.460000 | 59.670000 | 2.350000 | 21.640000 | 7.260000 | 23.610000 | 7.970000 | 28.450000 | 10.330000 | 9.280000 | 2.930000 | 41.960000 | 33.580000 | 52.230000 | 44.320000 | 58.430000 | 67.490000 | 124.000000 | 1.38000 | 473.000000 | 7.400000 | 8.880000 | 15.860000 | 15.860000 | 0.190000 | 0.230000 | 0.410000 | 0.410000 | 87.490000 | 0.110000 | 2.480000 | 1.780000 | 2.260000 | 2.50000 | 2.020000 | 31.880000 | 0.73000 | 45.87000 | 2.000000 | 259.000000 | 85.220000 | 31.090000 | 0.390000 | 25.00000 | 1939.000000 | 0.660000 | 0.250000 | 41600.000000 | 56700.000000 | 78000.000000 | 36400.000000 | 227.000000 | 299.000000 | 385.000000 | 155.000000 | 395.000000 | 23.700000 | 19.500000 | 10.90000 | 0.00000 | 0.000000 | 2.590000 | 25.240000 | 28.170000 | 52.730000 | 59.20000 | 91.000000 | 104.930000 | 83.000000 | 99.410000 | 40473.000000 | 42529.000000 | 193.40000 | 104.900000 | 89.390000 | 32.200000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 2.000000 | 6.00000 | 0.000000 | 7.900000 | 10.00000 | 0.290000 | 29.000000 | 6.124758e+06 | 90.070000 | 0.000000 | 0.000000 | 86495.300000 | 0.000000 | 0.000000 | 3.000000 | 19.630000 | 0.000000 | 0.000000 | 1.000000 | 6.540000 | 117.000000 | 443.000000 | 370.000000 | 2420.830000 | 13.000000 | 85.060000 | 2.000000 | 5.850000 | 26.170000 | 3637.430000 |
| 25% | 7.000000 | 23656.250000 | 4.000000 | 20360.250000 | 2.470000 | 0.670000 | 74.420000 | 1.580000 | 0.997500 | 12.987500 | 27.702500 | 12.900000 | 4.875000 | 2534.500000 | 17.205000 | 28699.750000 | 79.68000 | 0.775000 | 41.14250 | 12.14000 | 6.312500 | 12.342500 | 34943.250000 | 14207.750000 | 15437.500000 | 8634.500000 | 8367.500000 | 9472.000000 | 4290.750000 | 9250.250000 | 1632.250000 | 5.96000 | 3.557500 | 10.157500 | 23.682500 | 4.240000 | 63.130000 | 4.132500 | 22.905000 | 7.610000 | 28.460000 | 9.442500 | 30.772500 | 13.212500 | 11.452500 | 2.975000 | 66.645000 | 63.977500 | 76.795000 | 71.767500 | 61.520000 | 69.590000 | 329.250000 | 2.57500 | 766.500000 | 10.310000 | 17.052500 | 25.772500 | 30.862500 | 0.505000 | 0.860000 | 1.325000 | 1.507500 | 90.505000 | 0.567500 | 3.050000 | 1.950000 | 2.345000 | 2.53250 | 2.150000 | 47.117500 | 1.40500 | 47.82000 | 2.000000 | 622.750000 | 88.290000 | 43.382500 | 2.412500 | 32.25250 | 1949.750000 | 3.452500 | 0.287500 | 72875.000000 | 90350.000000 | 115875.000000 | 43800.000000 | 305.750000 | 403.500000 | 500.500000 | 180.750000 | 472.250000 | 24.075000 | 20.350000 | 11.60000 | 14.50000 | 0.250000 | 3.820000 | 35.297500 | 38.192500 | 68.627500 | 74.97000 | 222.250000 | 127.400000 | 209.750000 | 115.970000 | 90976.500000 | 88653.025000 | 352.32500 | 127.400000 | 89.742500 | 75.152500 | 3.105000 | 1.410000 | 0.00000 | 5.677500 | 10.250000 | 9.00000 | 70.950000 | 18.425000 | 474.45000 | 0.637500 | 136.250000 | 2.187468e+07 | 90.197500 | 0.000000 | 0.000000 | 108644.900000 | 0.000000 | 0.000000 | 9.750000 | 43.902500 | 3.250000 | 13.567500 | 14.250000 | 70.010000 | 142.500000 | 747.592500 | 728.750000 | 3678.982500 | 28.250000 | 142.757500 | 7.000000 | 24.230000 | 125.725000 | 5535.310000 |
| 50% | 7.000000 | 55612.500000 | 5.500000 | 29236.500000 | 2.685000 | 3.760000 | 81.515000 | 1.965000 | 2.810000 | 13.750000 | 29.405000 | 14.555000 | 9.055000 | 20169.000000 | 98.985000 | 31670.500000 | 82.05500 | 0.865000 | 47.63500 | 20.44000 | 6.830000 | 13.645000 | 37157.000000 | 14936.500000 | 16479.500000 | 11166.500000 | 9841.000000 | 11700.000000 | 8708.500000 | 9801.500000 | 3000.000000 | 11.24500 | 5.315000 | 15.710000 | 27.575000 | 6.020000 | 68.585000 | 11.115000 | 27.390000 | 9.095000 | 31.565000 | 11.875000 | 33.380000 | 14.530000 | 13.190000 | 3.050000 | 72.870000 | 70.855000 | 78.900000 | 73.125000 | 63.930000 | 71.920000 | 775.500000 | 3.33000 | 1416.500000 | 14.225000 | 22.830000 | 32.600000 | 38.995000 | 0.700000 | 1.035000 | 1.480000 | 1.725000 | 92.520000 | 0.790000 | 4.510000 | 2.990000 | 2.455000 | 2.62500 | 2.235000 | 58.445000 | 3.29500 | 50.03000 | 2.500000 | 837.500000 | 92.130000 | 53.650000 | 4.065000 | 35.80500 | 1969.000000 | 4.360000 | 0.505000 | 84200.000000 | 111600.000000 | 143950.000000 | 56350.000000 | 346.000000 | 454.000000 | 583.000000 | 220.500000 | 508.000000 | 25.100000 | 21.400000 | 12.75000 | 48.00000 | 2.000000 | 4.505000 | 45.480000 | 42.560000 | 73.640000 | 78.64000 | 344.500000 | 184.395000 | 316.500000 | 167.410000 | 228905.500000 | 107817.950000 | 425.05000 | 184.400000 | 93.330000 | 90.835000 | 4.435000 | 2.225000 | 0.37500 | 8.110000 | 14.500000 | 10.50000 | 122.700000 | 35.400000 | 1100.80000 | 1.195000 | 236.500000 | 2.826889e+07 | 90.725000 | 2.500000 | 0.000000 | 136726.300000 | 1.000000 | 4.120000 | 25.500000 | 62.655000 | 43.500000 | 127.315000 | 88.500000 | 219.560000 | 251.000000 | 907.585000 | 1623.000000 | 4727.200000 | 84.000000 | 248.415000 | 11.000000 | 36.100000 | 461.230000 | 5712.280000 |
| 75% | 17.500000 | 64937.500000 | 7.000000 | 179535.250000 | 2.777500 | 12.540000 | 96.972500 | 3.100000 | 4.787500 | 18.217500 | 34.950000 | 20.187500 | 10.740000 | 176090.500000 | 100.000000 | 41595.500000 | 88.58000 | 1.120000 | 62.96000 | 23.59750 | 8.122500 | 16.110000 | 47186.000000 | 19435.250000 | 21030.500000 | 13316.500000 | 13261.250000 | 14098.250000 | 10198.500000 | 12570.750000 | 13352.500000 | 12.46500 | 7.807500 | 18.045000 | 32.660000 | 6.422500 | 71.245000 | 14.332500 | 29.802500 | 11.115000 | 36.330000 | 12.895000 | 38.682500 | 15.995000 | 14.375000 | 3.172500 | 74.970000 | 73.390000 | 82.762500 | 77.340000 | 66.407500 | 75.075000 | 3705.500000 | 4.25750 | 9601.250000 | 21.602500 | 29.967500 | 37.975000 | 44.575000 | 0.940000 | 1.247500 | 1.572500 | 2.070000 | 92.725000 | 1.357500 | 5.300000 | 3.630000 | 2.620000 | 2.74750 | 2.385000 | 62.820000 | 5.22750 | 57.14750 | 3.000000 | 9001.250000 | 94.085000 | 58.750000 | 5.772500 | 41.64250 | 1971.750000 | 5.150000 | 0.760000 | 88600.000000 | 118750.000000 | 152350.000000 | 59500.000000 | 406.250000 | 514.250000 | 651.750000 | 270.500000 | 560.500000 | 25.850000 | 21.875000 | 13.10000 | 171.00000 | 26.250000 | 5.212500 | 57.707500 | 46.192500 | 77.885000 | 83.39250 | 1443.750000 | 378.765000 | 1302.250000 | 343.482500 | 480382.750000 | 123035.172500 | 540.45000 | 378.775000 | 96.930000 | 94.150000 | 19.647500 | 2.637500 | 1.22000 | 22.842500 | 61.500000 | 11.50000 | 162.575000 | 105.375000 | 2488.37500 | 3.132500 | 373.250000 | 7.410609e+07 | 92.092500 | 6.250000 | 2.852500 | 212014.672500 | 18.750000 | 13.750000 | 168.000000 | 80.992500 | 446.000000 | 258.875000 | 863.500000 | 477.072500 | 1565.000000 | 1458.805000 | 8562.250000 | 5570.495000 | 1100.500000 | 665.182500 | 105.000000 | 37.680000 | 874.262500 | 7276.460000 |
| max | 173.000000 | 79000.000000 | 10.000000 | 606900.000000 | 2.920000 | 65.840000 | 98.930000 | 4.820000 | 5.390000 | 26.330000 | 45.340000 | 32.370000 | 15.230000 | 606900.000000 | 100.000000 | 47924.000000 | 90.83000 | 1.270000 | 89.04000 | 33.08000 | 10.930000 | 21.120000 | 54088.000000 | 20125.000000 | 34563.000000 | 23000.000000 | 17500.000000 | 18666.000000 | 13713.000000 | 20971.000000 | 96278.000000 | 19.34000 | 9.560000 | 26.850000 | 39.120000 | 8.620000 | 75.000000 | 25.410000 | 33.610000 | 14.330000 | 40.150000 | 13.670000 | 55.640000 | 16.430000 | 15.030000 | 3.210000 | 77.390000 | 78.060000 | 95.600000 | 82.350000 | 69.420000 | 79.410000 | 70523.000000 | 18.20000 | 58887.000000 | 28.890000 | 36.960000 | 48.350000 | 57.550000 | 2.330000 | 3.590000 | 4.690000 | 5.580000 | 95.170000 | 2.550000 | 8.960000 | 4.600000 | 2.680000 | 2.84000 | 2.590000 | 74.490000 | 8.25000 | 67.33000 | 3.000000 | 28855.000000 | 95.240000 | 71.630000 | 17.400000 | 56.91000 | 1976.000000 | 8.200000 | 2.490000 | 98300.000000 | 123900.000000 | 258700.000000 | 172000.000000 | 454.000000 | 587.000000 | 736.000000 | 301.000000 | 664.000000 | 30.100000 | 23.500000 | 14.90000 | 4682.00000 | 131.000000 | 9.700000 | 67.860000 | 53.490000 | 84.460000 | 89.09000 | 4506.000000 | 813.360000 | 4066.000000 | 733.940000 | 871531.000000 | 157316.090000 | 827.40000 | 813.400000 | 97.320000 | 100.000000 | 64.400000 | 2.840000 | 2.63000 | 65.420000 | 198.000000 | 13.00000 | 197.900000 | 2686.900000 | 9538.90000 | 37.300000 | 590.000000 | 2.081850e+08 | 94.740000 | 10.000000 | 6.020000 | 375785.190000 | 454.000000 | 81.950000 | 324.000000 | 123.330000 | 7107.000000 | 1282.850000 | 9003.000000 | 1625.090000 | 11532.000000 | 2081.590000 | 31466.000000 | 6118.530000 | 8060.000000 | 1454.870000 | 200.000000 | 41.420000 | 3048.380000 | 9252.350000 |
* MODULE 3: INITIAL CODE AND RESULTS
#az = crime_data.loc[crime_data['state'] == "CA"]
#az.boxplot(column="murdPerPop")
#dc = crime_data.loc[crime_data['state']== "DC"]
Previously, we have discovered a number of missing values in the dataset. The number of values missing in each column varies. At this point, there are missing number values which are significant and cannot be used in the analysis. These columns will have to be dropped. I have decided on a NaN threshold of 50% (0.5), so any columns which are missing 50% of its values or more will be dropped from the dataset.
#after deciding on a NaN ratio threshold, drop the columns which exceed the threshold (50%)
crimedata_reduced= crime_data.drop(['countyCode', 'communityCode', 'fold', 'LemasSwornFT', 'LemasSwFTPerPop', 'LemasSwFTFieldOps', 'LemasSwFTFieldPerPop', 'LemasTotalReq', 'LemasTotReqPerPop', 'PolicReqPerOffic', 'PolicPerPop', 'RacialMatchCommPol', 'PctPolicWhite', 'PctPolicBlack', 'PctPolicHisp', 'PctPolicAsian', 'PctPolicMinor', 'OfficAssgnDrugUnits', 'NumKindsDrugsSeiz', 'PolicAveOTWorked', 'PolicCars', 'PolicOperBudg', 'LemasPctPolicOnPatr', 'LemasGangUnitDeploy', 'PolicBudgPerPop'], axis=1)
crimedata_reduced.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct12t29 | agePct16t24 | agePct65up | numbUrban | pctUrban | medIncome | pctWWage | pctWFarmSelf | pctWInvInc | pctWSocSec | pctWPubAsst | pctWRetire | medFamInc | perCapInc | whitePerCap | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | NumUnderPov | PctPopUnderPov | PctLess9thGrade | PctNotHSGrad | PctBSorMore | PctUnemployed | PctEmploy | PctEmplManu | PctEmplProfServ | PctOccupManu | PctOccupMgmtProf | MalePctDivorce | MalePctNevMarr | FemalePctDiv | TotalPctDiv | PersPerFam | PctFam2Par | PctKids2Par | PctYoungKids2Par | PctTeen2Par | PctWorkMomYoungKids | PctWorkMom | NumKidsBornNeverMar | PctKidsBornNeverMar | NumImmig | PctImmigRecent | PctImmigRec5 | PctImmigRec8 | PctImmigRec10 | PctRecentImmig | PctRecImmig5 | PctRecImmig8 | PctRecImmig10 | PctSpeakEnglOnly | PctNotSpeakEnglWell | PctLargHouseFam | PctLargHouseOccup | PersPerOccupHous | PersPerOwnOccHous | PersPerRentOccHous | PctPersOwnOccup | PctPersDenseHous | PctHousLess3BR | MedNumBR | HousVacant | PctHousOccup | PctHousOwnOcc | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctHousNoPhone | PctWOFullPlumb | OwnOccLowQuart | OwnOccMedVal | OwnOccHiQuart | OwnOccQrange | RentLowQ | RentMedian | RentHighQ | RentQrange | MedRent | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | NumInShelters | NumStreet | PctForeignBorn | PctBornSameState | PctSameHouse85 | PctSameCity85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murders | murdPerPop | rapes | rapesPerPop | robberies | robbbPerPop | assaults | assaultPerPop | burglaries | burglPerPop | larcenies | larcPerPop | autoTheft | autoTheftPerPop | arsons | arsonsPerPop | ViolentCrimesPerPop | nonViolPerPop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 21.44 | 10.93 | 11.33 | 11980 | 100.0 | 75122 | 89.24 | 1.55 | 70.20 | 23.62 | 1.03 | 18.39 | 79584 | 29711 | 30233 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 227 | 1.96 | 5.81 | 9.90 | 48.18 | 2.70 | 64.55 | 14.65 | 28.82 | 5.49 | 50.73 | 3.67 | 26.38 | 5.22 | 4.47 | 3.22 | 91.43 | 90.17 | 95.78 | 95.81 | 44.56 | 58.88 | 31 | 0.36 | 1277 | 8.69 | 13.00 | 20.99 | 30.93 | 0.93 | 1.39 | 2.24 | 3.30 | 85.68 | 1.37 | 4.81 | 4.17 | 2.99 | 3.00 | 2.84 | 91.46 | 0.39 | 11.06 | 3 | 64 | 98.37 | 91.01 | 3.12 | 37.50 | 1959 | 0.00 | 0.28 | 215900 | 262600 | 326900 | 111000 | 685 | 1001 | 1001 | 316 | 1001 | 23.8 | 21.1 | 14.0 | 11 | 0 | 10.66 | 53.72 | 65.29 | 78.09 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0 | 0.0 | 0.0 | 0.00 | 1.0 | 8.20 | 4.0 | 32.81 | 14.0 | 114.85 | 138.0 | 1132.08 | 16.0 | 131.26 | 2.0 | 16.41 | 41.02 | 1394.59 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 21.30 | 10.48 | 17.18 | 23123 | 100.0 | 47917 | 78.99 | 1.11 | 64.11 | 35.50 | 2.75 | 22.85 | 55323 | 20148 | 20191 | 18137 | 0 | 20074 | 5250.0 | 12222 | 885 | 3.98 | 5.61 | 13.72 | 29.89 | 2.43 | 61.96 | 12.26 | 29.28 | 6.39 | 37.64 | 4.23 | 27.99 | 6.45 | 5.42 | 3.11 | 86.91 | 85.33 | 96.82 | 86.46 | 51.14 | 62.43 | 43 | 0.24 | 1920 | 5.21 | 8.65 | 13.33 | 22.50 | 0.43 | 0.72 | 1.11 | 1.87 | 87.79 | 1.81 | 4.25 | 3.34 | 2.70 | 2.83 | 1.96 | 89.03 | 1.01 | 23.60 | 3 | 240 | 97.15 | 84.88 | 0.00 | 18.33 | 1958 | 0.31 | 0.14 | 136300 | 164200 | 199900 | 63600 | 467 | 560 | 672 | 205 | 627 | 27.6 | 20.7 | 12.5 | 0 | 0 | 8.30 | 77.17 | 71.27 | 90.22 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0 | 0.0 | 1.0 | 4.25 | 5.0 | 21.26 | 24.0 | 102.05 | 57.0 | 242.37 | 376.0 | 1598.78 | 26.0 | 110.55 | 1.0 | 4.25 | 127.56 | 1955.95 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 25.88 | 11.01 | 10.28 | 29344 | 100.0 | 35669 | 82.00 | 1.15 | 55.73 | 22.25 | 2.94 | 14.56 | 42112 | 16946 | 17103 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 1389 | 4.75 | 2.80 | 9.09 | 30.13 | 4.01 | 69.80 | 15.95 | 21.52 | 8.79 | 32.48 | 10.10 | 25.78 | 14.76 | 12.55 | 2.95 | 78.54 | 78.85 | 92.37 | 75.72 | 66.08 | 74.19 | 164 | 0.88 | 1468 | 16.42 | 23.98 | 32.08 | 35.63 | 0.82 | 1.20 | 1.61 | 1.78 | 93.11 | 1.14 | 2.97 | 2.05 | 2.42 | 2.69 | 2.06 | 64.18 | 2.03 | 47.46 | 3 | 544 | 95.68 | 57.79 | 0.92 | 7.54 | 1976 | 1.55 | 0.12 | 74700 | 90400 | 112000 | 37300 | 370 | 428 | 520 | 150 | 484 | 24.1 | 21.7 | 11.6 | 16 | 0 | 5.00 | 44.77 | 36.60 | 61.26 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 3 | 8.3 | 6.0 | 16.60 | 56.0 | 154.95 | 14.0 | 38.74 | 274.0 | 758.14 | 1797.0 | 4972.19 | 136.0 | 376.30 | 22.0 | 60.87 | 218.59 | 6167.51 |
| 3 | Gloversvillecity | NY | 16656 | 2.40 | 1.70 | 97.35 | 0.50 | 0.70 | 12.55 | 25.20 | 12.19 | 17.57 | 0 | 0.0 | 20580 | 68.15 | 0.24 | 38.95 | 39.48 | 11.71 | 18.33 | 26501 | 10810 | 10909 | 9984 | 4941 | 3541 | 2451.0 | 4391 | 2831 | 17.23 | 11.05 | 33.68 | 10.81 | 9.86 | 54.74 | 31.22 | 27.43 | 26.76 | 22.71 | 10.98 | 28.15 | 14.47 | 12.91 | 2.98 | 64.02 | 62.36 | 65.38 | 67.43 | 59.59 | 70.27 | 561 | 3.84 | 339 | 13.86 | 13.86 | 15.34 | 15.34 | 0.28 | 0.28 | 0.31 | 0.31 | 94.98 | 0.56 | 3.93 | 2.56 | 2.37 | 2.51 | 2.20 | 58.18 | 1.21 | 45.66 | 3 | 669 | 91.19 | 54.89 | 2.54 | 57.85 | 1939 | 7.00 | 0.87 | 36400 | 49600 | 66500 | 30100 | 195 | 250 | 309 | 114 | 333 | 28.7 | 20.6 | 14.5 | 0 | 0 | 2.04 | 88.71 | 56.70 | 90.17 | 96.24 | 5.2 | 3217.7 | 3.31 | 0.0 | 0 | 0.0 | 10.0 | 57.86 | 10.0 | 57.86 | 33.0 | 190.93 | 225.0 | 1301.78 | 716.0 | 4142.56 | 47.0 | 271.93 | NaN | NaN | 306.64 | NaN |
| 4 | Bemidjicity | MN | 11245 | 2.76 | 0.53 | 89.16 | 1.17 | 0.52 | 24.46 | 40.53 | 28.69 | 12.65 | 0 | 0.0 | 17390 | 69.33 | 0.55 | 42.82 | 32.16 | 11.21 | 14.43 | 24018 | 8483 | 9009 | 887 | 4425 | 3352 | 3000.0 | 1328 | 2855 | 29.99 | 12.15 | 23.06 | 25.28 | 9.08 | 52.44 | 6.89 | 36.54 | 10.94 | 27.80 | 7.51 | 50.66 | 11.64 | 9.73 | 2.98 | 58.59 | 55.20 | 66.51 | 79.17 | 61.22 | 68.94 | 402 | 4.70 | 196 | 46.94 | 56.12 | 67.86 | 69.90 | 0.82 | 0.98 | 1.18 | 1.22 | 94.64 | 0.39 | 5.23 | 3.11 | 2.35 | 2.55 | 2.12 | 58.13 | 2.94 | 55.64 | 2 | 333 | 92.45 | 53.57 | 3.90 | 42.64 | 1958 | 7.45 | 0.82 | 30600 | 43200 | 59500 | 28900 | 202 | 283 | 362 | 160 | 332 | 32.2 | 23.2 | 12.9 | 2 | 0 | 1.74 | 73.75 | 42.22 | 60.34 | 89.02 | 11.5 | 974.2 | 0.38 | 0.0 | 0 | 0.0 | NaN | NaN | 4.0 | 32.04 | 14.0 | 112.14 | 91.0 | 728.93 | 1060.0 | 8490.87 | 91.0 | 728.93 | 5.0 | 40.05 | NaN | 9988.79 |
First, I will decide on my target/dependent variables. The two I have chosen are MURDERS and ROBBERIES. Many of my potential dependent variables are missing values, and are not able to be imputed as it may introduce bias. For this reason, I have chosen these 2 categories as my dependent variables because they have the LEAST amount of missing values. Now that my dependent variables are chosen, I can move on to dealing with my independent variables.
There are still a number of columns that remain with missing values. The number of NaNs, however, were below the threshold of 0.5 so they will be used in the analysis. To deal with these missing values, we will turn to imputation. For the columns of our independent variables with 1-15 total missing values, I have chosen to impute them with the overall mode of the column. Because the distribution of the columns is skewed, I have chosen to impute with the mode value per each individual column.
However, there are still a number of columns with a significant number of missing values, but fall below the threshold. These will have to be dealt with differently. For these values, I have chosen to aggregate the data first, and then choose my imputation method. The nature of the organization of the crimes is by neighbourhood, which are each found in a state. I thought aggregrating by state was the most logical choice, as we may find more geographical, social, and economic similarties overall (but not entirely) within each state and its population. Because there are a different number of neighbourhoods represented for each state, the representation is unbalanced. Because of this, I thought the most appropriate imputation method would be to impute these missing values with the mean by state, per each column.
#impute missing values
#for columns with very small number of missing values, impute with MODE
crimedata_reduced['burglaries'].fillna(crimedata_reduced['burglaries'].mode()[0], inplace=True)
crimedata_reduced['burglPerPop'].fillna(crimedata_reduced['burglPerPop'].mode()[0], inplace=True)
crimedata_reduced['larcenies'].fillna(crimedata_reduced['larcenies'].mode()[0], inplace=True)
crimedata_reduced['larcPerPop'].fillna(crimedata_reduced['larcPerPop'].mode()[0], inplace=True)
crimedata_reduced['autoTheft'].fillna(crimedata_reduced['autoTheft'].mode()[0], inplace=True)
crimedata_reduced['autoTheftPerPop'].fillna(crimedata_reduced['autoTheftPerPop'].mode()[0], inplace=True)
crimedata_reduced['OtherPerCap'].fillna(crimedata_reduced['OtherPerCap'].mode()[0], inplace=True)
crimedata_reduced['assaults'].fillna(crimedata_reduced['assaults'].mode()[0], inplace=True)
crimedata_reduced['assaultPerPop'].fillna(crimedata_reduced['assaultPerPop'].mode()[0], inplace=True)
#Find out the total counts of how many times each state is represented in the dataset
#With this information, I have chosen to impute by the mean value of the column, aggregated by state
#crimedata_reduced[crimedata_reduced['rapes'].isnull()]
#crimedata_reduced.pivot_table(index = ['state'], aggfunc ='size')
from collections import Counter
print(Counter(crimedata_reduced['state']))
Counter({'CA': 279, 'NJ': 211, 'TX': 162, 'MA': 123, 'OH': 111, 'MI': 108, 'PA': 101, 'FL': 90, 'CT': 71, 'MN': 66, 'WI': 60, 'IN': 48, 'NY': 46, 'NC': 46, 'AL': 43, 'MO': 42, 'WA': 40, 'IL': 40, 'GA': 37, 'OK': 36, 'TN': 35, 'VA': 33, 'OR': 31, 'SC': 28, 'KY': 26, 'RI': 26, 'AR': 25, 'CO': 25, 'UT': 24, 'LA': 22, 'NH': 21, 'MS': 20, 'AZ': 20, 'IA': 20, 'ME': 17, 'WV': 14, 'MD': 12, 'NM': 10, 'SD': 9, 'ND': 8, 'WY': 7, 'ID': 7, 'NV': 5, 'VT': 4, 'AK': 3, 'KS': 1, 'DE': 1, 'DC': 1})
At this point, I will go through each state, and aggregate them one by one into their own, new data frames. For each seperate state dataframe, I will check for missing values. Rather than checking for the total number, I want the column names of where these missing values are located. I will then confirm the total length/number of rows in the dataset, and then check the total number of missing values in each column to get an idea of how much information is missing in each column per state. If the ratio is not significant, I will then move forward and impute the missing values as discussed previously, and then confirm if there are still any columns left with missing values.
Once confirmed that there are no longer any columns missing values, I will have to put these values back into the main dataset. I have filtered out the main dataset by state, so I will create a new dataframe titled crimedata_new. This new dataframe will include all the data, EXCEPT the data of the particular state I was working with. I will then append the seperate state dataframe to crimedata_new, replacing the old data of the state in which values were missing. Once again, I will confirm that all missing values were imputed as planned, by checking the count of missing values of crimedata_new, indexed by the particular state.
STATE: CA
#Separate all rows represented by the state CA, and create a dataframe with them
CA = crimedata_reduced[crimedata_reduced['state']=='CA']
CA_df= pd.DataFrame(CA)
#Check for missing values: return names of each column that carries a missing value
CA_df.columns[CA_df.isna().any()]
#Check the total number of rows represented by the particular state
len(CA_df)
#Check total number of missing values in that column to establish the ratio of missing information that exists here
CA_df['ViolentCrimesPerPop'].isna().sum()
#If the ratio of missing information is not significant, impute the missing values with the mean of that column of its own state
CA_df['ViolentCrimesPerPop'].fillna(CA_df['ViolentCrimesPerPop'].mean(), inplace = True)
#Confirm values were successfully imputed
CA_df.columns[CA_df.isna().any()]
Index([], dtype='object')
#Create a dataframe where we filter out all rows represented by the particular state
ca_filtered = crimedata_reduced[crimedata_reduced['state'] != 'CA']
#Create a new dataframe which will serve as our main, fully imputed dataframe
#In this new dataframe, take the filtered dataframe and append the imputed state dataframe to it
crimedata_new = pd.concat([ca_filtered, CA_df], ignore_index=True)
#Confirm there are no missing values in the rows of the particular state in the new main dataframe
Counter(crimedata_new[crimedata_new['state'] == "CA"].isna().any())
Counter({False: 122})
I will repeat the previous steps with every single state represented. If no columns are returned with missing values, I will move onto the next state.
STATE: NJ
NJ = crimedata_reduced[crimedata_reduced['state']=='NJ']
NJ_df= pd.DataFrame(NJ)
NJ_df.columns[NJ_df.isna().any()]
Index([], dtype='object')
STATE: TX
TX = crimedata_reduced[crimedata_reduced['state']=='TX']
TX_df= pd.DataFrame(TX)
TX_df.columns[TX_df.isna().any()]
len(TX_df)
TX_df['rapes'].isna().sum()
TX_df['rapesPerPop'].isna().sum()
TX_df['arsons'].isna().sum()
TX_df['arsonsPerPop'].isna().sum()
TX_df['ViolentCrimesPerPop'].isna().sum()
TX_df['nonViolPerPop'].isna().sum()
5
TX_df['rapes'].fillna(TX_df['rapes'].mean(), inplace = True)
TX_df['rapesPerPop'].fillna(TX_df['rapesPerPop'].mean(), inplace = True)
TX_df['arsons'].fillna(TX_df['arsons'].mean(), inplace = True)
TX_df['arsonsPerPop'].fillna(TX_df['arsonsPerPop'].mean(), inplace = True)
TX_df['ViolentCrimesPerPop'].fillna(TX_df['ViolentCrimesPerPop'].mean(), inplace = True)
TX_df['nonViolPerPop'].fillna(TX_df['nonViolPerPop'].mean(), inplace = True)
TX_df.columns[TX_df.isna().any()]
tx_filtered = crimedata_reduced[crimedata_reduced['state'] != 'TX']
crimedata_new = pd.concat([tx_filtered, TX_df], ignore_index=True)
Counter(crimedata_new[crimedata_new['state'] == "TX"].isna().any())
Counter({False: 120, True: 2})
STATE: MA
MA = crimedata_reduced[crimedata_reduced['state']=='MA']
MA_df= pd.DataFrame(MA)
MA_df.columns[MA_df.isna().any()]
len(MA_df)
MA_df['arsons'].isna().sum()
MA_df['arsonsPerPop'].isna().sum()
MA_df['ViolentCrimesPerPop'].isna().sum()
MA_df['nonViolPerPop'].isna().sum()
7
MA_df['arsons'].fillna(MA_df['arsons'].mean(), inplace = True)
MA_df['arsonsPerPop'].fillna(MA_df['arsonsPerPop'].mean(), inplace = True)
MA_df['ViolentCrimesPerPop'].fillna(MA_df['ViolentCrimesPerPop'].mean(), inplace = True)
MA_df['nonViolPerPop'].fillna(MA_df['nonViolPerPop'].mean(), inplace = True)
MA_df.columns[MA_df.isna().any()]
ma_filtered = crimedata_reduced[crimedata_reduced['state'] != 'MA']
crimedata_new = pd.concat([ma_filtered, MA_df], ignore_index=True)
Counter(crimedata_new[crimedata_new['state'] == "MA"].isna().any())
Counter({False: 122})
STATE: OH
OH = crimedata_reduced[crimedata_reduced['state']=='OH']
OH_df= pd.DataFrame(OH)
OH_df.columns[OH_df.isna().any()]
len(OH_df)
OH_df['arsons'].isna().sum()
OH_df['arsonsPerPop'].isna().sum()
OH_df['ViolentCrimesPerPop'].isna().sum()
OH_df['nonViolPerPop'].isna().sum()
3
OH_df['arsons'].fillna(OH_df['arsons'].mean(), inplace = True)
OH_df['arsonsPerPop'].fillna(OH_df['arsonsPerPop'].mean(), inplace = True)
OH_df['ViolentCrimesPerPop'].fillna(OH_df['ViolentCrimesPerPop'].mean(), inplace = True)
OH_df['nonViolPerPop'].fillna(OH_df['nonViolPerPop'].mean(), inplace = True)
OH_df.columns[OH_df.isna().any()]
oh_filtered = crimedata_reduced[crimedata_reduced['state'] != 'OH']
crimedata_new = pd.concat([oh_filtered, OH_df], ignore_index=True)
Counter(crimedata_new[crimedata_new['state'] == "OH"].isna().any())
Counter({False: 122})
***For the state MI, I have run into a problem. Out of the 6 columns which include missing values, 3 of the columns are able to be imputed, as they are not missing a significant amount of values. However the other 3 are missing ALL values in the entire column. This particular state is represented by 108 rows. The columns, rapes, rapesPerPop and ViolentCrimesPerPop are missing all 108 values.
STATE: MI
MI = crimedata_reduced[crimedata_reduced['state']=='MI']
MI_df= pd.DataFrame(MI)
MI_df.columns[MI_df.isna().any()]
len(MI_df)
#TOTAL rows for MI was 108
#The three columns beneath are missing ALL 108 values in these columns, there are no values to work with in order to impute the mean
MI_df['ViolentCrimesPerPop'].isna().sum()
MI_df['rapes'].isna().sum()
MI_df['rapesPerPop'].isna().sum()
#The following rows are not missing an insignificant amount of values, they can be imputed as planned
MI_df['arsons'].isna().sum()
MI_df['arsonsPerPop'].isna().sum()
MI_df['nonViolPerPop'].isna().sum()
1
#Impute the columns that are appropriate to be imputed by the mean
MI_df['arsons'].fillna(MI_df['arsons'].mean(), inplace = True)
MI_df['arsonsPerPop'].fillna(MI_df['arsonsPerPop'].mean(), inplace = True)
MI_df['nonViolPerPop'].fillna(MI_df['nonViolPerPop'].mean(), inplace = True)
MI_df.columns[MI_df.isna().any()]
mi_filtered = crimedata_reduced[crimedata_reduced['state'] != 'MI']
crimedata_new = pd.concat([mi_filtered, MI_df], ignore_index=True)
Counter(crimedata_new[crimedata_new['state'] == "MI"].isna().any())
MI_df.columns[MI_df.isna().any()]
#There are still 3 columns that have not been addressed
#These columns must be revisited
Index(['rapes', 'rapesPerPop', 'ViolentCrimesPerPop'], dtype='object')
STATE: PA
PA = crimedata_reduced[crimedata_reduced['state']=='PA']
PA_df= pd.DataFrame(PA)
PA_df.columns[PA_df.isna().any()]
len(PA_df)
PA_df['arsons'].isna().sum()
PA_df['arsonsPerPop'].isna().sum()
PA_df['nonViolPerPop'].isna().sum()
1
PA_df['arsons'].fillna(PA_df['arsons'].mean(), inplace = True)
PA_df['arsonsPerPop'].fillna(PA_df['arsonsPerPop'].mean(), inplace = True)
PA_df['nonViolPerPop'].fillna(PA_df['nonViolPerPop'].mean(), inplace = True)
PA_df.columns[PA_df.isna().any()]
pa_filtered = crimedata_reduced[crimedata_reduced['state'] != 'PA']
crimedata_new = pd.concat([pa_filtered, PA_df], ignore_index=True)
Counter(crimedata_new[crimedata_new['state'] == "PA"].isna().any())
Counter({False: 122})
STATE: FL
FL = crimedata_reduced[crimedata_reduced['state']=='FL']
FL_df= pd.DataFrame(FL)
FL_df.columns[FL_df.isna().any()]
Index([], dtype='object')
STATE: CT
CT = crimedata_reduced[crimedata_reduced['state']=='CT']
CT_df= pd.DataFrame(CT)
CT_df.columns[CT_df.isna().any()]
len(CT_df)
CT_df['ViolentCrimesPerPop'].isna().sum()
2
CT_df['ViolentCrimesPerPop'].fillna(CT_df['ViolentCrimesPerPop'].mean(), inplace = True)
CT_df.columns[CT_df.isna().any()]
ct_filtered = crimedata_reduced[crimedata_reduced['state'] != 'CT']
crimedata_new = pd.concat([ct_filtered, CT_df], ignore_index=True)
Counter(crimedata_new[crimedata_new['state'] == "CT"].isna().any())
Counter({False: 122})
STATE: MN
MN = crimedata_reduced[crimedata_reduced['state']=='MN']
MN_df= pd.DataFrame(MN)
MN_df.columns[CA_df.isna().any()]
Index([], dtype='object')
STATE: WI
WI = crimedata_reduced[crimedata_reduced['state']=='WI']
WI_df= pd.DataFrame(WI)
WI_df.columns[WI_df.isna().any()]
len(WI_df)
WI_df['arsons'].isna().sum()
WI_df['arsonsPerPop'].isna().sum()
WI_df['nonViolPerPop'].isna().sum()
1
WI_df['arsons'].fillna(WI_df['arsons'].mean(), inplace = True)
WI_df['arsonsPerPop'].fillna(WI_df['arsonsPerPop'].mean(), inplace = True)
WI_df['nonViolPerPop'].fillna(WI_df['nonViolPerPop'].mean(), inplace = True)
WI_df.columns[WI_df.isna().any()]
wi_filtered = crimedata_reduced[crimedata_reduced['state'] != 'WI']
crimedata_new = pd.concat([wi_filtered, WI_df], ignore_index=True)
Counter(crimedata_new[crimedata_new['state'] == "WI"].isna().any())
Counter({False: 122})
STATE: IN
IN = crimedata_reduced[crimedata_reduced['state']=='IN']
IN_df= pd.DataFrame(IN)
IN_df.columns[IN_df.isna().any()]
Index([], dtype='object')
STATE: NY
NY = crimedata_reduced[crimedata_reduced['state']=='NY']
NY_df= pd.DataFrame(NY)
NY_df.columns[NY_df.isna().any()]
len(NY_df)
NY_df['arsons'].isna().sum()
NY_df['arsonsPerPop'].isna().sum()
NY_df['nonViolPerPop'].isna().sum()
17
NY_df['arsons'].fillna(NY_df['arsons'].mean(), inplace = True)
NY_df['arsonsPerPop'].fillna(NY_df['arsonsPerPop'].mean(), inplace = True)
NY_df['nonViolPerPop'].fillna(NY_df['nonViolPerPop'].mean(), inplace = True)
NY_df.columns[NY_df.isna().any()]
ny_filtered = crimedata_reduced[crimedata_reduced['state'] != 'NY']
crimedata_new = pd.concat([ny_filtered, NY_df], ignore_index=True)
Counter(crimedata_new[crimedata_new['state'] == "NY"].isna().any())
Counter({False: 122})
STATE: NC
NC = crimedata_reduced[crimedata_reduced['state']=='NC']
NC_df= pd.DataFrame(NC)
NC_df.columns[NC_df.isna().any()]
Index([], dtype='object')
STATE: AL
AL = crimedata_reduced[crimedata_reduced['state']=='AL']
AL_df= pd.DataFrame(AL)
AL_df.columns[AL_df.isna().any()]
len(AL_df)
AL_df['arsons'].isna().sum()
AL_df['arsonsPerPop'].isna().sum()
AL_df['nonViolPerPop'].isna().sum()
len(AL_df)
43
AL_df['arsons'].fillna(AL_df['arsons'].mean(), inplace = True)
AL_df['arsonsPerPop'].fillna(AL_df['arsonsPerPop'].mean(), inplace = True)
AL_df['nonViolPerPop'].fillna(AL_df['nonViolPerPop'].mean(), inplace = True)
AL_df.columns[AL_df.isna().any()]
al_filtered = crimedata_reduced[crimedata_reduced['state'] != 'AL']
crimedata_new = pd.concat([al_filtered, AL_df], ignore_index=True)
Counter(crimedata_new[crimedata_new['state'] == "AL"].isna().any())
Counter({False: 122})
STATE: MO
MO = crimedata_reduced[crimedata_reduced['state']=='MO']
MO_df= pd.DataFrame(MO)
MO_df.columns[MO_df.isna().any()]
Index([], dtype='object')
STATE: WA
WA = crimedata_reduced[crimedata_reduced['state']=='WA']
WA_df= pd.DataFrame(WA)
WA_df.columns[WA_df.isna().any()]
len(WA_df)
WA_df['arsons'].isna().sum()
WA_df['arsonsPerPop'].isna().sum()
WA_df['nonViolPerPop'].isna().sum()
1
WA_df['arsons'].fillna(WA_df['arsons'].mean(), inplace = True)
WA_df['arsonsPerPop'].fillna(WA_df['arsonsPerPop'].mean(), inplace = True)
WA_df['nonViolPerPop'].fillna(WA_df['nonViolPerPop'].mean(), inplace = True)
WA_df.columns[WA_df.isna().any()]
wa_filtered = crimedata_reduced[crimedata_reduced['state'] != 'WA']
crimedata_new = pd.concat([wa_filtered, WA_df], ignore_index=True)
Counter(crimedata_new[crimedata_new['state'] == "WA"].isna().any())
Counter({False: 122})
STATE: IL **MISSING VALUES 1.0: All columns which are missing values are missing 100% of its values
IL = crimedata_reduced[crimedata_reduced['state']=='IL']
IL_df= pd.DataFrame(IL)
IL_df.columns[IL_df.isna().any()]
len(IL_df)
IL_df['rapes'].isna().sum()
IL_df['rapesPerPop'].isna().sum()
IL_df['ViolentCrimesPerPop'].isna().sum()
40
STATE: GA
GA = crimedata_reduced[crimedata_reduced['state']=='GA']
GA_df= pd.DataFrame(GA)
GA_df.columns[GA_df.isna().any()]
Index([], dtype='object')
STATE: OK
OK = crimedata_reduced[crimedata_reduced['state']=='OK']
OK_df= pd.DataFrame(OK)
OK_df.columns[OK_df.isna().any()]
Index([], dtype='object')
STATE: TN
TN = crimedata_reduced[crimedata_reduced['state']=='TN']
TN_df= pd.DataFrame(TN)
TN_df.columns[TN_df.isna().any()]
len(TN_df)
TN_df['arsons'].isna().sum()
TN_df['arsonsPerPop'].isna().sum()
TN_df['nonViolPerPop'].isna().sum()
2
TN_df['arsons'].fillna(TN_df['arsons'].mean(), inplace = True)
TN_df['arsonsPerPop'].fillna(TN_df['arsonsPerPop'].mean(), inplace = True)
TN_df['nonViolPerPop'].fillna(TN_df['nonViolPerPop'].mean(), inplace = True)
TN_df.columns[WA_df.isna().any()]
tn_filtered = crimedata_reduced[crimedata_reduced['state'] != 'TN']
crimedata_new = pd.concat([tn_filtered, TN_df], ignore_index=True)
Counter(crimedata_new[crimedata_new['state'] == "TN"].isna().any())
Counter({False: 122})
STATE: VA
VA = crimedata_reduced[crimedata_reduced['state']=='VA']
VA_df= pd.DataFrame(VA)
VA_df.columns[VA_df.isna().any()]
Index([], dtype='object')
STATE: OR
OR = crimedata_reduced[crimedata_reduced['state']=='OR']
OR_df= pd.DataFrame(OR)
OR_df.columns[OR_df.isna().any()]
Index([], dtype='object')
STATE: SC
SC = crimedata_reduced[crimedata_reduced['state']=='SC']
SC_df= pd.DataFrame(SC)
SC_df.columns[SC_df.isna().any()]
Index([], dtype='object')
STATE: KY
KY = crimedata_reduced[crimedata_reduced['state']=='KY']
KY_df= pd.DataFrame(KY)
KY_df.columns[KY_df.isna().any()]
Index([], dtype='object')
STATE: RI
RI = crimedata_reduced[crimedata_reduced['state']=='RI']
RI_df= pd.DataFrame(RI)
RI_df.columns[RI_df.isna().any()]
Index([], dtype='object')
STATE: AR
AR = crimedata_reduced[crimedata_reduced['state']=='AR']
AR_df= pd.DataFrame(AR)
AR_df.columns[AR_df.isna().any()]
Index([], dtype='object')
STATE: CO
CO = crimedata_reduced[crimedata_reduced['state']=='CO']
CO_df= pd.DataFrame(CO)
CO_df.columns[CO_df.isna().any()]
Index([], dtype='object')
STATE: UT
UT = crimedata_reduced[crimedata_reduced['state']=='UT']
UT_df= pd.DataFrame(UT)
UT_df.columns[UT_df.isna().any()]
Index([], dtype='object')
STATE: LA
LA = crimedata_reduced[crimedata_reduced['state']=='LA']
LA_df= pd.DataFrame(LA)
LA_df.columns[LA_df.isna().any()]
len(LA_df)
LA_df['arsons'].isna().sum()
LA_df['arsonsPerPop'].isna().sum()
LA_df['nonViolPerPop'].isna().sum()
3
LA_df['arsons'].fillna(LA_df['arsons'].mean(), inplace = True)
LA_df['arsonsPerPop'].fillna(LA_df['arsonsPerPop'].mean(), inplace = True)
LA_df['nonViolPerPop'].fillna(LA_df['nonViolPerPop'].mean(), inplace = True)
LA_df.columns[LA_df.isna().any()]
la_filtered = crimedata_reduced[crimedata_reduced['state'] != 'LA']
crimedata_new = pd.concat([la_filtered, LA_df], ignore_index=True)
Counter(crimedata_new[crimedata_new['state'] == "LA"].isna().any())
Counter({False: 122})
STATE: NH
NH = crimedata_reduced[crimedata_reduced['state']=='NH']
NH_df= pd.DataFrame(NH)
NH_df.columns[NH_df.isna().any()]
Index([], dtype='object')
STATE: MS
MS = crimedata_reduced[crimedata_reduced['state']=='MS']
MS_df= pd.DataFrame(MS)
MS_df.columns[MS_df.isna().any()]
len(MS_df)
MS_df['ViolentCrimesPerPop'].isna().sum()
1
MS_df['ViolentCrimesPerPop'].fillna(MS_df['ViolentCrimesPerPop'].mean(), inplace = True)
MS_df.columns[MS_df.isna().any()]
ms_filtered = crimedata_reduced[crimedata_reduced['state'] != 'MS']
crimedata_new = pd.concat([ms_filtered, MS_df], ignore_index=True)
Counter(crimedata_new[crimedata_new['state'] == "MS"].isna().any())
Counter({False: 122})
STATE: AZ
AZ = crimedata_reduced[crimedata_reduced['state']=='AZ']
AZ_df= pd.DataFrame(AZ)
AZ_df.columns[AZ_df.isna().any()]
Index([], dtype='object')
STATE: IA **MISSING VALUES 0.85
IA = crimedata_reduced[crimedata_reduced['state']=='IA']
IA_df= pd.DataFrame(IA)
IA_df.columns[IA_df.isna().any()]
len(IA_df)
IA_df['arsons'].isna().sum()
IA_df['arsonsPerPop'].isna().sum()
IA_df['nonViolPerPop'].isna().sum()
17
STATE: ME
ME = crimedata_reduced[crimedata_reduced['state']=='ME']
ME_df= pd.DataFrame(ME)
ME_df.columns[ME_df.isna().any()]
Index([], dtype='object')
STATE: WV
WV = crimedata_reduced[crimedata_reduced['state']=='WV']
WV_df= pd.DataFrame(WV)
WV_df.columns[WV_df.isna().any()]
Index([], dtype='object')
STATE: MD
MD = crimedata_reduced[crimedata_reduced['state']=='MD']
MD_df= pd.DataFrame(MD)
MD_df.columns[MD_df.isna().any()]
len(MD_df)
MD_df['arsons'].isna().sum()
MD_df['arsonsPerPop'].isna().sum()
MD_df['nonViolPerPop'].isna().sum()
3
MD_df['arsons'].fillna(MD_df['arsons'].mean(), inplace = True)
MD_df['arsonsPerPop'].fillna(MD_df['arsonsPerPop'].mean(), inplace = True)
MD_df['nonViolPerPop'].fillna(MD_df['nonViolPerPop'].mean(), inplace = True)
MD_df.columns[MD_df.isna().any()]
md_filtered = crimedata_reduced[crimedata_reduced['state'] != 'MD']
crimedata_new = pd.concat([md_filtered, MD_df], ignore_index=True)
Counter(crimedata_new[crimedata_new['state'] == "MD"].isna().any())
Counter({False: 122})
STATE: NM
NM = crimedata_reduced[crimedata_reduced['state']=='NM']
NM_df= pd.DataFrame(NM)
NM_df.columns[NM_df.isna().any()]
Index([], dtype='object')
STATE: SD
SD = crimedata_reduced[crimedata_reduced['state']=='SD']
SD_df= pd.DataFrame(SD)
SD_df.columns[SD_df.isna().any()]
len(SD_df)
SD_df['arsons'].isna().sum()
SD_df['arsonsPerPop'].isna().sum()
SD_df['nonViolPerPop'].isna().sum()
1
SD_df['arsons'].fillna(SD_df['arsons'].mean(), inplace = True)
SD_df['arsonsPerPop'].fillna(SD_df['arsonsPerPop'].mean(), inplace = True)
SD_df['nonViolPerPop'].fillna(SD_df['nonViolPerPop'].mean(), inplace = True)
SD_df.columns[SD_df.isna().any()]
sd_filtered = crimedata_reduced[crimedata_reduced['state'] != 'SD']
crimedata_new = pd.concat([sd_filtered, SD_df], ignore_index=True)
Counter(crimedata_new[crimedata_new['state'] == "SD"].isna().any())
Counter({False: 122})
STATE: ND
ND = crimedata_reduced[crimedata_reduced['state']=='ND']
ND_df= pd.DataFrame(ND)
ND_df.columns[ND_df.isna().any()]
Index([], dtype='object')
STATE: WY
WY = crimedata_reduced[crimedata_reduced['state']=='WY']
WY_df= pd.DataFrame(WY)
WY_df.columns[WY_df.isna().any()]
Index([], dtype='object')
STATE: ID
ID = crimedata_reduced[crimedata_reduced['state']=='ID']
ID_df= pd.DataFrame(ID)
ID_df.columns[ID_df.isna().any()]
Index([], dtype='object')
STATE: NY
NY = crimedata_reduced[crimedata_reduced['state']=='NY']
NY_df= pd.DataFrame(NY)
NY_df.columns[NY_df.isna().any()]
len(NY_df)
NY_df['arsons'].isna().sum()
NY_df['arsonsPerPop'].isna().sum()
NY_df['nonViolPerPop'].isna().sum()
17
NY_df['arsons'].fillna(NY_df['arsons'].mean(), inplace = True)
NY_df['arsonsPerPop'].fillna(NY_df['arsonsPerPop'].mean(), inplace = True)
NY_df['nonViolPerPop'].fillna(NY_df['nonViolPerPop'].mean(), inplace = True)
NY_df.columns[NY_df.isna().any()]
ny_filtered = crimedata_reduced[crimedata_reduced['state'] != 'NY']
crimedata_new = pd.concat([ny_filtered, NY_df], ignore_index=True)
Counter(crimedata_new[crimedata_new['state'] == "NY"].isna().any())
Counter({False: 122})
STATE: VT **MISSING VALUES 1.0
VT = crimedata_reduced[crimedata_reduced['state']=='VT']
VT_df= pd.DataFrame(VT)
VT_df.columns[VT_df.isna().any()]
len(VT_df)
VT_df['arsons'].isna().sum()
VT_df['arsonsPerPop'].isna().sum()
VT_df['nonViolPerPop'].isna().sum()
len(VT_df)
4
STATE: AK
AK = crimedata_reduced[crimedata_reduced['state']=='AK']
AK_df= pd.DataFrame(AK)
AK_df.columns[AK_df.isna().any()]
Index([], dtype='object')
STATE: KS *MISSING VALUES 1.0
KS = crimedata_reduced[crimedata_reduced['state']=='KS']
KS_df= pd.DataFrame(KS)
KS_df.columns[KS_df.isna().any()]
len(KS_df)
KS_df['arsons'].isna().sum()
KS_df['arsonsPerPop'].isna().sum()
KS_df['nonViolPerPop'].isna().sum()
len(KS_df)
1
STATE: DE
DE = crimedata_reduced[crimedata_reduced['state']=='DE']
DE_df= pd.DataFrame(DE)
DE_df.columns[DE_df.isna().any()]
Index([], dtype='object')
STATE: DC
DC = crimedata_reduced[crimedata_reduced['state']=='DC']
DC_df= pd.DataFrame(DC)
DC_df.columns[DC_df.isna().any()]
Index([], dtype='object')
**There were a number of states that had a significant amount of missing values from some of its columns. These columns were: 'rapes', 'rapesPerPop', 'arsons', 'arsonsPerPop', 'ViolentCrimesPerPop', and 'nonViolPerPop'. The reason these were marked as significant was because they exceeded the missing value threshold of 0.65 of missing values. Some of the states are missing 100% of its values in a specific column.
Because of the nature of the amount of missing values, we are unable to impute these values with the mean. For states missing 100% of the values in a column, there is no mean to impute them with. That being said, the approach to their imputation must be handled differently. I have considered taking the 3-4 surrounding states of this particular state, aggregate them, find the mean of this particular column and then impute the mean into the missing values. This runs the risk of introducing bias into the dataset. The other option would be to drop these rows entirely, which would include dropping 6 entire states.
These states include: MI, AL, IL, IA, VT, KS
#Check the total number of missing values for each column that was reported to have missing values
crimedata_new['rapes'].isna().sum()
crimedata_new['rapesPerPop'].isna().sum()
crimedata_new['arsons'].isna().sum()
crimedata_new['arsonsPerPop'].isna().sum()
crimedata_new['nonViolPerPop'].isna().sum()
80
#DROP ALL ROWS WITH MISSING VALUES
crimedata_new = crimedata_new.dropna()
I have decided to drop all rows with missing values. Although we are now missing a number of entire states, the bias this may introduce (not having a fully representational sample of the United States), I believe will ultimately affect the predictive models less and introduce considerably less bias into our data as the other option of imputing the mean of surrounding areas would.
Now that we have gone through each state, imputed as necessary, and dropped rows with missing values, we will check the entire dataset for any columns with missing values that we may have missed.
#Check whole dataset for columns with missing values
crimedata_new.columns[crimedata_new.isna().any()]
len(crimedata_new)
1919
#Check datatypes again to confirm
crimedata_new.dtypes
Check the ranges of our target variables:
print(crimedata_new['murdPerPop'].min())
print(crimedata_new['murdPerPop'].max())
0.0 91.09
print(crimedata_new['robbbPerPop'].min())
print(crimedata_new['robbbPerPop'].max())
0.0 2264.13
Here we will deal with CORRELATION: I will run a correlation matrix on the entire dataset to get an idea of which attribute pairs have the highest positive and negative correlations.
pd.set_option('display.max_rows', None)
#Run correlation matrix
crime_corr = crimedata_new.corr(numeric_only=True)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(30, 10))
sns.heatmap(crime_corr)
<AxesSubplot:>
#Unstack the matrix and sort the values from lowest to highest so we can clearly see the most negatively and positively correlated pairs
sorted_mat = crime_corr.unstack().sort_values()
sorted_mat
Now we will create a dataset for each target variable, giving us 2 seperate datasets. For each dataset, we will examine the correlations between each independent variable and our dependent variable, as a step of general exploration.
We will then run a full correlation matrix on the entire target variable dataset to see which attributes are highly correlated, so we can then drop the necessary columns.
MURDER CATEGORY CORRELATIONS:
#Create dataset for the specific crime target variable
murders_data = crimedata_new.drop(['rapes', 'rapesPerPop', 'robberies', 'robbbPerPop', 'assaults', 'assaultPerPop',
'burglaries', 'burglPerPop', 'larcenies', 'larcPerPop', 'autoTheft', 'autoTheftPerPop',
'arsons', 'arsonsPerPop', 'ViolentCrimesPerPop', 'nonViolPerPop'], axis=1)
murders_data.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct12t29 | agePct16t24 | agePct65up | numbUrban | pctUrban | medIncome | pctWWage | pctWFarmSelf | pctWInvInc | pctWSocSec | pctWPubAsst | pctWRetire | medFamInc | perCapInc | whitePerCap | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | NumUnderPov | PctPopUnderPov | PctLess9thGrade | PctNotHSGrad | PctBSorMore | PctUnemployed | PctEmploy | PctEmplManu | PctEmplProfServ | PctOccupManu | PctOccupMgmtProf | MalePctDivorce | MalePctNevMarr | FemalePctDiv | TotalPctDiv | PersPerFam | PctFam2Par | PctKids2Par | PctYoungKids2Par | PctTeen2Par | PctWorkMomYoungKids | PctWorkMom | NumKidsBornNeverMar | PctKidsBornNeverMar | NumImmig | PctImmigRecent | PctImmigRec5 | PctImmigRec8 | PctImmigRec10 | PctRecentImmig | PctRecImmig5 | PctRecImmig8 | PctRecImmig10 | PctSpeakEnglOnly | PctNotSpeakEnglWell | PctLargHouseFam | PctLargHouseOccup | PersPerOccupHous | PersPerOwnOccHous | PersPerRentOccHous | PctPersOwnOccup | PctPersDenseHous | PctHousLess3BR | MedNumBR | HousVacant | PctHousOccup | PctHousOwnOcc | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctHousNoPhone | PctWOFullPlumb | OwnOccLowQuart | OwnOccMedVal | OwnOccHiQuart | OwnOccQrange | RentLowQ | RentMedian | RentHighQ | RentQrange | MedRent | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | NumInShelters | NumStreet | PctForeignBorn | PctBornSameState | PctSameHouse85 | PctSameCity85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murders | murdPerPop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 21.44 | 10.93 | 11.33 | 11980 | 100.0 | 75122 | 89.24 | 1.55 | 70.20 | 23.62 | 1.03 | 18.39 | 79584 | 29711 | 30233 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 227 | 1.96 | 5.81 | 9.90 | 48.18 | 2.70 | 64.55 | 14.65 | 28.82 | 5.49 | 50.73 | 3.67 | 26.38 | 5.22 | 4.47 | 3.22 | 91.43 | 90.17 | 95.78 | 95.81 | 44.56 | 58.88 | 31 | 0.36 | 1277 | 8.69 | 13.00 | 20.99 | 30.93 | 0.93 | 1.39 | 2.24 | 3.30 | 85.68 | 1.37 | 4.81 | 4.17 | 2.99 | 3.00 | 2.84 | 91.46 | 0.39 | 11.06 | 3 | 64 | 98.37 | 91.01 | 3.12 | 37.50 | 1959 | 0.00 | 0.28 | 215900 | 262600 | 326900 | 111000 | 685 | 1001 | 1001 | 316 | 1001 | 23.8 | 21.1 | 14.0 | 11 | 0 | 10.66 | 53.72 | 65.29 | 78.09 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0 | 0.00 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 21.30 | 10.48 | 17.18 | 23123 | 100.0 | 47917 | 78.99 | 1.11 | 64.11 | 35.50 | 2.75 | 22.85 | 55323 | 20148 | 20191 | 18137 | 0 | 20074 | 5250.0 | 12222 | 885 | 3.98 | 5.61 | 13.72 | 29.89 | 2.43 | 61.96 | 12.26 | 29.28 | 6.39 | 37.64 | 4.23 | 27.99 | 6.45 | 5.42 | 3.11 | 86.91 | 85.33 | 96.82 | 86.46 | 51.14 | 62.43 | 43 | 0.24 | 1920 | 5.21 | 8.65 | 13.33 | 22.50 | 0.43 | 0.72 | 1.11 | 1.87 | 87.79 | 1.81 | 4.25 | 3.34 | 2.70 | 2.83 | 1.96 | 89.03 | 1.01 | 23.60 | 3 | 240 | 97.15 | 84.88 | 0.00 | 18.33 | 1958 | 0.31 | 0.14 | 136300 | 164200 | 199900 | 63600 | 467 | 560 | 672 | 205 | 627 | 27.6 | 20.7 | 12.5 | 0 | 0 | 8.30 | 77.17 | 71.27 | 90.22 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0 | 0.00 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 25.88 | 11.01 | 10.28 | 29344 | 100.0 | 35669 | 82.00 | 1.15 | 55.73 | 22.25 | 2.94 | 14.56 | 42112 | 16946 | 17103 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 1389 | 4.75 | 2.80 | 9.09 | 30.13 | 4.01 | 69.80 | 15.95 | 21.52 | 8.79 | 32.48 | 10.10 | 25.78 | 14.76 | 12.55 | 2.95 | 78.54 | 78.85 | 92.37 | 75.72 | 66.08 | 74.19 | 164 | 0.88 | 1468 | 16.42 | 23.98 | 32.08 | 35.63 | 0.82 | 1.20 | 1.61 | 1.78 | 93.11 | 1.14 | 2.97 | 2.05 | 2.42 | 2.69 | 2.06 | 64.18 | 2.03 | 47.46 | 3 | 544 | 95.68 | 57.79 | 0.92 | 7.54 | 1976 | 1.55 | 0.12 | 74700 | 90400 | 112000 | 37300 | 370 | 428 | 520 | 150 | 484 | 24.1 | 21.7 | 11.6 | 16 | 0 | 5.00 | 44.77 | 36.60 | 61.26 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 3 | 8.30 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 32.89 | 20.04 | 13.26 | 140494 | 100.0 | 21577 | 75.78 | 1.00 | 41.15 | 29.31 | 7.12 | 14.09 | 27705 | 11878 | 12029 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 23223 | 17.78 | 8.76 | 23.03 | 20.66 | 5.72 | 59.02 | 14.31 | 26.83 | 14.72 | 23.42 | 11.40 | 33.32 | 14.46 | 13.04 | 2.89 | 71.94 | 69.79 | 79.76 | 75.33 | 62.96 | 70.52 | 1511 | 1.58 | 2091 | 21.33 | 30.56 | 38.02 | 45.48 | 0.32 | 0.45 | 0.57 | 0.68 | 96.87 | 0.60 | 3.08 | 1.92 | 2.28 | 2.37 | 2.16 | 57.81 | 2.11 | 53.19 | 2 | 5119 | 91.81 | 55.50 | 2.09 | 26.22 | 1966 | 6.13 | 0.31 | 37700 | 53900 | 73100 | 35400 | 215 | 280 | 349 | 134 | 340 | 26.4 | 17.3 | 11.7 | 327 | 4 | 1.49 | 64.35 | 42.29 | 70.61 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 7 | 4.63 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 27.41 | 12.76 | 14.42 | 28700 | 100.0 | 42805 | 79.47 | 0.39 | 47.70 | 30.23 | 5.41 | 17.23 | 50394 | 18193 | 18276 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 1126 | 4.01 | 4.49 | 13.89 | 27.01 | 4.85 | 65.42 | 14.02 | 27.17 | 8.50 | 32.78 | 5.97 | 36.05 | 9.06 | 7.64 | 3.14 | 79.53 | 79.76 | 92.05 | 77.12 | 65.16 | 72.81 | 263 | 1.18 | 2637 | 11.38 | 16.27 | 23.93 | 27.76 | 1.05 | 1.49 | 2.20 | 2.55 | 89.98 | 0.60 | 5.08 | 3.46 | 2.55 | 2.89 | 2.09 | 64.62 | 1.47 | 47.35 | 3 | 566 | 95.11 | 56.96 | 1.41 | 34.45 | 1956 | 0.69 | 0.28 | 155100 | 179000 | 215500 | 60400 | 463 | 669 | 824 | 361 | 736 | 24.4 | 20.8 | 12.5 | 0 | 0 | 9.19 | 77.30 | 63.45 | 82.23 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0 | 0.00 |
for col in murders_data.columns:
print(col)
plt.figure(figsize=(30, 10))
heatmap = sns.heatmap(murders_data.corr(), vmin=-1, vmax=1, annot=True)
C:\Users\radon\AppData\Local\Temp\ipykernel_10788\312952547.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. heatmap = sns.heatmap(murders_data.corr(), vmin=-1, vmax=1, annot=True)
#Examine the correlations of each column with 'murdPerPop' as our target variable
murdPerPop_corr = murders_data[murders_data.columns[1:]].corr()['murdPerPop'][:]
#Sort the values so we can see the positive and negative correlation clearly
murdPerPop_corr.sort_values()
#Examine the correlations of each columns and 'murders' as our target variable
murders_corr = murders_data[murders_data.columns[1:]].corr()['murders'][:]
#Sort the values so we can see the positive and negative correlations clearly
murders_corr.sort_values()
#murd_corr = murders_data.corr(method='spearman')
#Create a correlation matrix for the dataframe category
murd_corr = murders_data.corr(numeric_only=True).abs()
#Select the upper triangle of the matrix, excluding the diagnonal elements
murd_tri = murd_corr.where(np.triu(np.ones(murd_corr.shape),k=1).astype(bool))
#drop the columns with a correlation greater than 0.8 and make a list of those columns named 'murd_drop'
murd_drop = [column for column in murd_tri.columns if any(murd_tri[column] > 0.8)]
#drop the murd_drop columns from the dataframe
murders = murders_data.drop(murders_data[murd_drop], axis=1)
#for col in murders.columns:
#print(col)
len(murders.columns)
53
ROBEERIES CATEGORY CORRELATIONS:
robberies_data = crimedata_new.drop(['rapes', 'rapesPerPop', 'murders', 'murdPerPop', 'assaults', 'assaultPerPop', 'burglaries', 'burglPerPop', 'larcenies', 'larcPerPop', 'autoTheft', 'autoTheftPerPop', 'arsons', 'arsonsPerPop', 'ViolentCrimesPerPop', 'nonViolPerPop'], axis=1)
robberies_data.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct12t29 | agePct16t24 | agePct65up | numbUrban | pctUrban | medIncome | pctWWage | pctWFarmSelf | pctWInvInc | pctWSocSec | pctWPubAsst | pctWRetire | medFamInc | perCapInc | whitePerCap | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | NumUnderPov | PctPopUnderPov | PctLess9thGrade | PctNotHSGrad | PctBSorMore | PctUnemployed | PctEmploy | PctEmplManu | PctEmplProfServ | PctOccupManu | PctOccupMgmtProf | MalePctDivorce | MalePctNevMarr | FemalePctDiv | TotalPctDiv | PersPerFam | PctFam2Par | PctKids2Par | PctYoungKids2Par | PctTeen2Par | PctWorkMomYoungKids | PctWorkMom | NumKidsBornNeverMar | PctKidsBornNeverMar | NumImmig | PctImmigRecent | PctImmigRec5 | PctImmigRec8 | PctImmigRec10 | PctRecentImmig | PctRecImmig5 | PctRecImmig8 | PctRecImmig10 | PctSpeakEnglOnly | PctNotSpeakEnglWell | PctLargHouseFam | PctLargHouseOccup | PersPerOccupHous | PersPerOwnOccHous | PersPerRentOccHous | PctPersOwnOccup | PctPersDenseHous | PctHousLess3BR | MedNumBR | HousVacant | PctHousOccup | PctHousOwnOcc | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctHousNoPhone | PctWOFullPlumb | OwnOccLowQuart | OwnOccMedVal | OwnOccHiQuart | OwnOccQrange | RentLowQ | RentMedian | RentHighQ | RentQrange | MedRent | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | NumInShelters | NumStreet | PctForeignBorn | PctBornSameState | PctSameHouse85 | PctSameCity85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | robberies | robbbPerPop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 21.44 | 10.93 | 11.33 | 11980 | 100.0 | 75122 | 89.24 | 1.55 | 70.20 | 23.62 | 1.03 | 18.39 | 79584 | 29711 | 30233 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 227 | 1.96 | 5.81 | 9.90 | 48.18 | 2.70 | 64.55 | 14.65 | 28.82 | 5.49 | 50.73 | 3.67 | 26.38 | 5.22 | 4.47 | 3.22 | 91.43 | 90.17 | 95.78 | 95.81 | 44.56 | 58.88 | 31 | 0.36 | 1277 | 8.69 | 13.00 | 20.99 | 30.93 | 0.93 | 1.39 | 2.24 | 3.30 | 85.68 | 1.37 | 4.81 | 4.17 | 2.99 | 3.00 | 2.84 | 91.46 | 0.39 | 11.06 | 3 | 64 | 98.37 | 91.01 | 3.12 | 37.50 | 1959 | 0.00 | 0.28 | 215900 | 262600 | 326900 | 111000 | 685 | 1001 | 1001 | 316 | 1001 | 23.8 | 21.1 | 14.0 | 11 | 0 | 10.66 | 53.72 | 65.29 | 78.09 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 1.0 | 8.20 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 21.30 | 10.48 | 17.18 | 23123 | 100.0 | 47917 | 78.99 | 1.11 | 64.11 | 35.50 | 2.75 | 22.85 | 55323 | 20148 | 20191 | 18137 | 0 | 20074 | 5250.0 | 12222 | 885 | 3.98 | 5.61 | 13.72 | 29.89 | 2.43 | 61.96 | 12.26 | 29.28 | 6.39 | 37.64 | 4.23 | 27.99 | 6.45 | 5.42 | 3.11 | 86.91 | 85.33 | 96.82 | 86.46 | 51.14 | 62.43 | 43 | 0.24 | 1920 | 5.21 | 8.65 | 13.33 | 22.50 | 0.43 | 0.72 | 1.11 | 1.87 | 87.79 | 1.81 | 4.25 | 3.34 | 2.70 | 2.83 | 1.96 | 89.03 | 1.01 | 23.60 | 3 | 240 | 97.15 | 84.88 | 0.00 | 18.33 | 1958 | 0.31 | 0.14 | 136300 | 164200 | 199900 | 63600 | 467 | 560 | 672 | 205 | 627 | 27.6 | 20.7 | 12.5 | 0 | 0 | 8.30 | 77.17 | 71.27 | 90.22 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 5.0 | 21.26 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 25.88 | 11.01 | 10.28 | 29344 | 100.0 | 35669 | 82.00 | 1.15 | 55.73 | 22.25 | 2.94 | 14.56 | 42112 | 16946 | 17103 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 1389 | 4.75 | 2.80 | 9.09 | 30.13 | 4.01 | 69.80 | 15.95 | 21.52 | 8.79 | 32.48 | 10.10 | 25.78 | 14.76 | 12.55 | 2.95 | 78.54 | 78.85 | 92.37 | 75.72 | 66.08 | 74.19 | 164 | 0.88 | 1468 | 16.42 | 23.98 | 32.08 | 35.63 | 0.82 | 1.20 | 1.61 | 1.78 | 93.11 | 1.14 | 2.97 | 2.05 | 2.42 | 2.69 | 2.06 | 64.18 | 2.03 | 47.46 | 3 | 544 | 95.68 | 57.79 | 0.92 | 7.54 | 1976 | 1.55 | 0.12 | 74700 | 90400 | 112000 | 37300 | 370 | 428 | 520 | 150 | 484 | 24.1 | 21.7 | 11.6 | 16 | 0 | 5.00 | 44.77 | 36.60 | 61.26 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 56.0 | 154.95 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 32.89 | 20.04 | 13.26 | 140494 | 100.0 | 21577 | 75.78 | 1.00 | 41.15 | 29.31 | 7.12 | 14.09 | 27705 | 11878 | 12029 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 23223 | 17.78 | 8.76 | 23.03 | 20.66 | 5.72 | 59.02 | 14.31 | 26.83 | 14.72 | 23.42 | 11.40 | 33.32 | 14.46 | 13.04 | 2.89 | 71.94 | 69.79 | 79.76 | 75.33 | 62.96 | 70.52 | 1511 | 1.58 | 2091 | 21.33 | 30.56 | 38.02 | 45.48 | 0.32 | 0.45 | 0.57 | 0.68 | 96.87 | 0.60 | 3.08 | 1.92 | 2.28 | 2.37 | 2.16 | 57.81 | 2.11 | 53.19 | 2 | 5119 | 91.81 | 55.50 | 2.09 | 26.22 | 1966 | 6.13 | 0.31 | 37700 | 53900 | 73100 | 35400 | 215 | 280 | 349 | 134 | 340 | 26.4 | 17.3 | 11.7 | 327 | 4 | 1.49 | 64.35 | 42.29 | 70.61 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 136.0 | 90.05 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 27.41 | 12.76 | 14.42 | 28700 | 100.0 | 42805 | 79.47 | 0.39 | 47.70 | 30.23 | 5.41 | 17.23 | 50394 | 18193 | 18276 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 1126 | 4.01 | 4.49 | 13.89 | 27.01 | 4.85 | 65.42 | 14.02 | 27.17 | 8.50 | 32.78 | 5.97 | 36.05 | 9.06 | 7.64 | 3.14 | 79.53 | 79.76 | 92.05 | 77.12 | 65.16 | 72.81 | 263 | 1.18 | 2637 | 11.38 | 16.27 | 23.93 | 27.76 | 1.05 | 1.49 | 2.20 | 2.55 | 89.98 | 0.60 | 5.08 | 3.46 | 2.55 | 2.89 | 2.09 | 64.62 | 1.47 | 47.35 | 3 | 566 | 95.11 | 56.96 | 1.41 | 34.45 | 1956 | 0.69 | 0.28 | 155100 | 179000 | 215500 | 60400 | 463 | 669 | 824 | 361 | 736 | 24.4 | 20.8 | 12.5 | 0 | 0 | 9.19 | 77.30 | 63.45 | 82.23 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 9.0 | 30.44 |
robberies_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1919 entries, 0 to 2214 Columns: 106 entries, communityname to robbbPerPop dtypes: float64(77), int64(27), object(2) memory usage: 1.6+ MB
plt.figure(figsize=(30, 10))
sns.heatmap(robberies_data.corr(), vmin=-1, vmax=1, annot=True)
C:\Users\radon\AppData\Local\Temp\ipykernel_10788\1798950277.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. sns.heatmap(robberies_data.corr(), vmin=-1, vmax=1, annot=True)
<AxesSubplot:>
#Examine the correlations of each columns and 'robbbPerPop' as our target variable
robbbPerPop_corr = robberies_data[robberies_data.columns[1:]].corr()['robbbPerPop'][:]
#Sort the values so we can see the positive and negative correlations clearly
robbbPerPop_corr.sort_values()
#Examine the correlations of each columns and 'robberies' as our target variable
robberies_corr = robberies_data[robberies_data.columns[1:]].corr()['robberies'][:]
#Sort the values so we can see the positive and negative correlations clearly
robberies_corr.sort_values()
#Create a correlation matrix for the dataframe category
robb_corr = robberies_data.corr(numeric_only=True).abs()
#Select the upper triangle of the matrix, excluding the diagnonal elements
robb_tri = robb_corr.where(np.triu(np.ones(robb_corr.shape),k=1).astype(bool))
#drop the columns with a correlation greater than 0.8 and make a list of those columns
robb_drop = [column for column in robb_tri.columns if any(robb_tri[column] > 0.8)]
#drop the columns in the previous list from the dataframe
robberies = robberies_data.drop(robberies_data[robb_drop], axis=1)
robberies.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | robbbPerPop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 8.20 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 21.26 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 154.95 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 90.05 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 30.44 |
robberies[robberies.columns[1:]].corr()['robbbPerPop'][:]
murders_copy[murders_copy.columns[1:]].corr()['murdPerPop'][:]
Here I have chosen to make a copy of the original murders dataset. I did not make a copy before altering the original dataset, so here I have backtracked in order to do this.
#Create a correlation matrix for the dataframe category
murd_corr2 = murders_data.corr(numeric_only=True).abs()
#Select the upper triangle of the matrix, excluding the diagnonal elements
murd_tri2 = murd_corr2.where(np.triu(np.ones(murd_corr.shape),k=1).astype(bool))
#drop the columns with a correlation greater than 0.8 and make a list of those columns named 'murd_drop'
murd_drop2 = [column for column in murd_tri2.columns if any(murd_tri2[column] > 0.8)]
#drop the murd_drop columns from the dataframe
murders_copy = murders_data.drop(murders_data[murd_drop2], axis=1)
murders_copy.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 |
We will now go through the remaining attributes to review the column summaries, as well as the distributions in order to see how balanced they are. If they are normally or close to normally distributed, we will normalize the column with a range of 0-1. If they are not normally distributed and clearly skewed/unbalanced, the column will be turned to a categorical variable with different levels. We will turn our target variables into a CATEGORICAL ATTRIBUTE. The quartiles within the column summary will establish the levels that are chosen. Lastly, we will run one more method of feature selection after normalization, INFORMATION GAIN. We will have to decide on either ENTROPY or GINI. Then we will drop any further columns if needed. This will serve as preparation of our dataset for the regression and classification models.
***RUN INFORMATION GAIN (ENTROPY OR GINI) I have attempted to run information gain on top of feature selection via correlation. I was unable to assess the issue effectively and will not be moving forward without it for now. I may revisit this at a later time.
#Examine the summaries of each attribute
murders.describe()
| population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.919000e+03 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 | 1919.000000 |
| mean | 5.222536e+04 | 2.710662 | 9.353835 | 83.515268 | 2.804054 | 8.673700 | 14.404299 | 12.009489 | 69.758046 | 33901.321001 | 0.884648 | 43.464356 | 6.773955 | 16.119192 | 11573.294424 | 12301.949453 | 14313.012507 | 9451.968213 | 11022.774883 | 11.685935 | 9.435503 | 22.991496 | 17.795654 | 24.504070 | 13.709792 | 9.176597 | 30.687061 | 73.977910 | 60.309349 | 13.662590 | 1.168708 | 65.509693 | 45.854846 | 2.623241 | 92.690594 | 2.780214 | 35.128588 | 1962.744659 | 0.433111 | 201.729026 | 26.362064 | 21.300834 | 13.033299 | 60.188791 | 51.336967 | 87.711407 | 28.031162 | 2815.384575 | 3.093622 | 1.007004 | 5.927712 |
| std | 2.051729e+05 | 0.347113 | 13.914794 | 16.325927 | 4.720112 | 15.388139 | 4.479374 | 4.847315 | 44.375875 | 13478.160054 | 0.688882 | 12.757634 | 4.474456 | 4.588814 | 9336.714385 | 15477.135843 | 9635.116167 | 8046.208476 | 5757.066299 | 8.451142 | 6.864861 | 12.521400 | 8.086259 | 6.650632 | 6.399682 | 2.806209 | 8.027055 | 10.302549 | 7.904721 | 9.680139 | 1.663076 | 14.131028 | 14.076144 | 0.519988 | 5.265741 | 3.447570 | 14.026619 | 11.160290 | 0.431390 | 87.361065 | 2.907053 | 2.948489 | 1.466011 | 17.074383 | 10.718396 | 7.488716 | 116.924373 | 2949.803130 | 4.957666 | 2.927262 | 9.038823 |
| min | 1.000500e+04 | 1.600000 | 0.000000 | 2.680000 | 0.060000 | 0.120000 | 4.580000 | 1.660000 | 0.000000 | 12908.000000 | 0.000000 | 9.020000 | 0.500000 | 3.460000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.640000 | 0.200000 | 1.630000 | 2.050000 | 8.690000 | 1.370000 | 2.130000 | 12.060000 | 32.240000 | 24.420000 | 0.000000 | 0.000000 | 13.930000 | 3.060000 | 1.000000 | 37.470000 | 0.000000 | 3.120000 | 1939.000000 | 0.000000 | 0.000000 | 14.900000 | 14.100000 | 10.100000 | 6.750000 | 11.830000 | 32.830000 | 0.900000 | 10.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 1.430700e+04 | 2.490000 | 0.930000 | 75.885000 | 0.625000 | 0.950000 | 12.210000 | 8.875000 | 0.000000 | 23689.500000 | 0.470000 | 34.285000 | 3.370000 | 13.045000 | 6749.500000 | 6346.500000 | 8537.500000 | 5584.000000 | 7299.500000 | 4.650000 | 4.760000 | 14.080000 | 11.985000 | 20.080000 | 9.035000 | 7.130000 | 25.475000 | 67.710000 | 55.290000 | 6.965000 | 0.190000 | 56.435000 | 37.745000 | 2.000000 | 90.950000 | 0.770000 | 24.690000 | 1956.000000 | 0.170000 | 140.000000 | 24.400000 | 19.200000 | 11.900000 | 48.365000 | 44.635000 | 84.725000 | 7.200000 | 1177.650000 | 0.370000 | 0.000000 | 0.000000 |
| 50% | 2.258000e+04 | 2.660000 | 3.070000 | 89.610000 | 1.260000 | 2.430000 | 13.620000 | 11.860000 | 100.000000 | 31231.000000 | 0.690000 | 42.440000 | 5.660000 | 15.740000 | 9787.000000 | 9874.000000 | 12408.000000 | 8182.000000 | 9703.000000 | 9.500000 | 7.900000 | 19.650000 | 16.670000 | 23.360000 | 13.020000 | 9.220000 | 29.080000 | 74.780000 | 60.610000 | 12.420000 | 0.540000 | 64.960000 | 46.790000 | 3.000000 | 93.990000 | 1.730000 | 34.470000 | 1964.000000 | 0.330000 | 175.000000 | 26.200000 | 21.400000 | 12.800000 | 62.080000 | 51.890000 | 89.620000 | 13.600000 | 2007.600000 | 1.240000 | 0.000000 | 2.420000 |
| 75% | 4.308450e+04 | 2.855000 | 11.390000 | 95.965000 | 2.815000 | 8.910000 | 15.390000 | 14.545000 | 100.000000 | 41464.500000 | 1.100000 | 52.270000 | 9.100000 | 18.845000 | 14519.000000 | 14790.500000 | 17413.500000 | 11407.500000 | 13424.000000 | 17.040000 | 12.145000 | 28.910000 | 22.730000 | 27.605000 | 17.390000 | 11.130000 | 33.470000 | 81.775000 | 65.585000 | 18.035000 | 1.410000 | 75.390000 | 54.210000 | 3.000000 | 95.920000 | 3.510000 | 44.275000 | 1971.000000 | 0.560000 | 242.500000 | 28.100000 | 23.300000 | 13.800000 | 74.170000 | 58.650000 | 92.730000 | 25.350000 | 3284.800000 | 3.450000 | 0.000000 | 8.610000 |
| max | 7.322564e+06 | 5.280000 | 96.670000 | 99.630000 | 57.460000 | 95.290000 | 54.400000 | 52.770000 | 100.000000 | 123625.000000 | 6.530000 | 89.040000 | 26.920000 | 45.510000 | 212120.000000 | 480000.000000 | 106165.000000 | 137000.000000 | 54648.000000 | 48.820000 | 49.890000 | 73.630000 | 50.030000 | 62.670000 | 44.270000 | 19.090000 | 76.320000 | 93.600000 | 87.970000 | 64.290000 | 13.710000 | 96.590000 | 95.340000 | 4.000000 | 99.000000 | 39.890000 | 82.130000 | 1987.000000 | 5.330000 | 803.000000 | 35.100000 | 32.700000 | 23.400000 | 93.140000 | 78.560000 | 99.900000 | 3569.800000 | 44229.900000 | 54.330000 | 48.440000 | 91.090000 |
#Check the skew values of all attributes, sort them in ascending order
murders.skew(numeric_only=True).sort_values()
#Find square root to establish how many bins are appropriate
import math
len(murders)
math.sqrt(2215)
#Plot the distribution
import matplotlib.pyplot as plt
murders['population'].hist(bins=20)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['population'].min())
print(murders['population'].max())
#Determine bins based on quantiles
pop_bins = pd.qcut(murders['population'], q=5)
#Check the value counts of each bin to ensure they are balanced
pop_bins.value_counts()
#Create bin labels
pop_bin_labels = ['10000-13500', '13500-19000', '19000-29000', '29000-51500', '515000-7500000']
#Create bins
pop_bin = [10000, 13500, 19000, 29000, 51500, 7500000]
#Add new category
murders['pop_bins'] = pd.cut(murders['population'], bins=pop_bin, labels=pop_bin_labels)
murders.head()
10005 7322564
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 |
murders['pop_bins'].value_counts()
10000-13500 410 13500-19000 387 515000-7500000 384 19000-29000 382 29000-51500 356 Name: pop_bins, dtype: int64
murders['householdsize'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#SHAPIRO TEST HERE
# apply normalization technique, using a range of [0,1]
murders['householdsize_bins'] = (murders['householdsize'] - murders['householdsize'].min()) / (murders['householdsize'].max() - murders['householdsize'].min())
murders.head()
murders['racepctblack'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['racepctblack'].min())
print(murders['racepctblack'].max())
#Determine bins based on quantiles
racepctblack_bins = pd.qcut(murders['racepctblack'], q=4)
#Check the value counts of each bin to ensure they are balanced
racepctblack_bins.value_counts()
#Create bin labels
racepctblack_bin_labels = ['0-0.8%', '0.9-2.8%', '2.9-11.1%', '11.2-97%']
#Create new bin category
murders['racepctblack_bins'] = pd.qcut(murders['racepctblack'],
q=4,
labels=racepctblack_bin_labels)
murders.head()
0.0 96.67
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% |
murders['racepctblack_bins'].value_counts()
0-0.8% 484 11.2-97% 480 2.9-11.1% 479 0.9-2.8% 476 Name: racepctblack_bins, dtype: int64
murders['racePctWhite'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['racePctWhite'].min())
print(murders['racePctWhite'].max())
#Determine bins based on quantiles
racePctWhite_= pd.qcut(murders['racePctWhite'], q=4)
#Check the value counts of each bin to ensure they are balanced
racePctWhite_.value_counts()
#Create bin labels
racePctWhite_labels = ['0.0-75%','75-90%', '90-96%', '96-100%']
#Create bins
racePctWhite_bin = [0, 75, 90, 96, 100]
#Add new category
#murders_backup = murders
murders['racePctWhite_bins'] = pd.cut(murders['racePctWhite'], bins=racePctWhite_bin, labels=racePctWhite_labels)
#murders.head()
2.68 99.63
murders['racePctWhite_bins'].value_counts()
75-90% 518 96-100% 474 90-96% 467 0.0-75% 460 Name: racePctWhite_bins, dtype: int64
murders['racePctAsian'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['racePctAsian'].min())
print(murders['racePctAsian'].max())
#Determine bins based on quantiles
racePctAsian_bins = pd.qcut(murders['racePctAsian'], q=4)
#Check the value counts of each bin to ensure they are balanced
racePctAsian_bins.value_counts()
#Create bin labels
racePctAsian_bin_labels = ['0.0-0.6%', '0.6-1.2%', '1.2-2.6%', '2.7-57.5%']
#Create new bin category
murders_backup = murders
murders['racePctAsian_bins'] = pd.qcut(murders['racePctAsian'],
q=4,
labels=racePctAsian_bin_labels)
#murders.head()
0.06 57.46
murders['racePctAsian_bins'].value_counts()
0.0-0.6% 480 0.6-1.2% 480 2.7-57.5% 480 1.2-2.6% 479 Name: racePctAsian_bins, dtype: int64
murders[' racePctHisp'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders[' racePctHisp'].min())
print(murders[' racePctHisp'].max())
#Determine bins based on quantiles
racePctHisp_bins = pd.qcut(murders[' racePctHisp'], q=4)
#Check the value counts of each bin to ensure they are balanced
racePctHisp_bins.value_counts()
#Create bin labels
racePctHisp_bin_labels = ['0.1-0.9%', '0.9-2.2%', '2.2-7.8%', '7.8-95.3%']
#Create new bin category
murders['racePctHisp_bins'] = pd.qcut(murders[' racePctHisp'],
q=4,
labels=racePctHisp_bin_labels)
murders.head()
murders['racePctHisp_bins'].value_counts()
0.1-0.9% 485 2.2-7.8% 480 7.8-95.3% 479 0.9-2.2% 475 Name: racePctHisp_bins, dtype: int64
murders['agePct12t21'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
# apply normalization technique, using a range of [0,1]
murders['agePct12t21_norm'] = (murders['agePct12t21'] - murders['agePct12t21'].min()) / (murders['agePct12t21'].max() - murders['agePct12t21'].min())
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | LandArea_bins | PopDens_bins | PctUsePubTrans_bins | murdPerPop_class_target | murdPerPop_reg_target | householdsize_norm | agePct12t21_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 0.407609 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% | 0.801139 | 85-90% | 0-7 | 1200-2000 | 0.5-50% | No | 0.000000 | 0.407609 | 0.158370 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 0.331522 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% | 0.890754 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | No | 0.000000 | 0.331522 | 0.129065 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 0.225543 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% | 0.371197 | 32-85% | 7-14 | 2000-3300 | 0.5-50% | Yes | 0.091119 | 0.225543 | 0.136090 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 0.230978 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% | 50-65% | 0.456466 | 85-90% | 26-4000 | 1200-2000 | 0.5-50% | Yes | 0.050829 | 0.230978 | 0.271176 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 0.271739 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% | 75-95% | 0.773565 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | No | 0.000000 | 0.271739 | 0.132276 |
murders = murders.drop(['householdsize_bins', 'agePct12t21_bins'], axis=1)
murders[' agePct65up'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
# apply normalization technique, using a range of [0,1]
murders['agePct65up_norm'] = (murders[' agePct65up'] - murders[' agePct65up'].min()) / (murders[' agePct65up'].max() - murders[' agePct65up'].min())
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | LandArea_bins | PopDens_bins | PctUsePubTrans_bins | murdPerPop_class_target | murdPerPop_reg_target | householdsize_norm | agePct12t21_norm | agePct65up_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% | 0.801139 | 85-90% | 0-7 | 1200-2000 | 0.5-50% | No | 0.000000 | 0.407609 | 0.158370 | 0.189200 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% | 0.890754 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | No | 0.000000 | 0.331522 | 0.129065 | 0.303659 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% | 0.371197 | 32-85% | 7-14 | 2000-3300 | 0.5-50% | Yes | 0.091119 | 0.225543 | 0.136090 | 0.168656 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% | 50-65% | 0.456466 | 85-90% | 26-4000 | 1200-2000 | 0.5-50% | Yes | 0.050829 | 0.230978 | 0.271176 | 0.226961 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% | 75-95% | 0.773565 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | No | 0.000000 | 0.271739 | 0.132276 | 0.249658 |
murders = murders.drop(['agePct65up_bins'], axis=1)
murders[' pctUrban'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders[' pctUrban'].min())
print(murders[' pctUrban'].max())
#Determine bins based on quantiles
print(murders[' pctUrban'].describe())
#Create bin labels
pctUrban_bin_labels = ['0.0%', '0.1-99%', '100%']
#Create bins
pctUrban_bin = [-1, 0.1, 99, 100]
#Add new category
murders_backup = murders
murders['pctUrban_bins'] = pd.cut(murders[' pctUrban'], bins=pctUrban_bin, labels=pctUrban_bin_labels)
murders.head()
0.0 100.0 count 1919.000000 mean 69.758046 std 44.375875 min 0.000000 25% 0.000000 50% 100.000000 75% 100.000000 max 100.000000 Name: pctUrban, dtype: float64
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% |
murders['pctUrban_bins'].value_counts()
#This distribution was not able to be balanced in any meaningful way
#I decided to leave the values as is, changing them to a binary yes/no would have been unbalanced and would have lost more information
100% 1158 0.0% 532 0.1-99% 229 Name: pctUrban_bins, dtype: int64
murders[' medIncome'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders[' medIncome'].min())
print(murders[' medIncome'].max())
#Determine bins based on quantiles
medIncome_ = pd.qcut(murders[' medIncome'], q=4)
#Check the value counts of each bin to ensure they are balanced
medIncome_.value_counts()
#Create bin labels
medIncome_labels = ['$8,000-$24,000', '$24,000-$32,000', '$32,000-$42,000', '42,000-$125,000']
#Create bins
medIncome_bin = [8865, 24000, 32000, 42000, 123625]
#Add new category
murders_backup = murders
murders['medIncome_bins'] = pd.cut(murders[' medIncome'], bins=medIncome_bin, labels=medIncome_labels)
murders.head()
12908 123625
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 |
murders['medIncome_bins'].value_counts()
$24,000-$32,000 511 $8,000-$24,000 500 42,000-$125,000 458 $32,000-$42,000 450 Name: medIncome_bins, dtype: int64
murders['pctWFarmSelf'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['pctWFarmSelf'].min())
print(murders['pctWFarmSelf'].max())
#Determine bins based on quantiles
murders['pctWFarmSelf'].describe()
#Create bin labels
pctWFarmSelf_labels = ['0-0.5%', '0.5-0.7%', '0.7-1.0%', '1.0-7.0%']
#Create bins
pctWFarmSelf_bin = [-1, 0.5, 0.7, 1, 7]
#Add new category
murders['pctWFarmSelf_bins'] = pd.cut(murders['pctWFarmSelf'], bins=pctWFarmSelf_bin, labels=pctWFarmSelf_labels)
murders.head()
0.0 6.53
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% |
murders['pctWFarmSelf_bins'].value_counts()
0-0.5% 578 1.0-7.0% 553 0.5-0.7% 401 0.7-1.0% 387 Name: pctWFarmSelf_bins, dtype: int64
murders['pctWInvInc'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
# apply normalization technique, using a range of [0,1]
murders['pctWInvInc_norm'] = (murders['pctWInvInc'] - murders['pctWInvInc'].min()) / (murders['pctWInvInc'].max() - murders['pctWInvInc'].min())
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 |
murders[' pctWPubAsst'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders[' pctWPubAsst'].min())
print(murders[' pctWPubAsst'].max())
#Determine bins based on quantiles
pctWPubAsst_ = pd.qcut(murders[' pctWPubAsst'], q=4)
#Check the value counts of each bin to ensure they are balanced
pctWPubAsst_.value_counts()
#Create bin labels
pctWPubAsst_labels = ['0-3%', '3-5%', '5-8%', '8-45%']
#Create bins
pctWPubAsst_bin = [0, 3, 5, 8.5, 45]
#Add new category
murders['pctWPubAsst_bins'] = pd.cut(murders[' pctWPubAsst'], bins=pctWPubAsst_bin, labels=pctWPubAsst_labels)
murders.head()
0.5 26.92
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% |
murders['pctWPubAsst_bins'].value_counts()
8-45% 552 5-8% 532 3-5% 441 0-3% 394 Name: pctWPubAsst_bins, dtype: int64
murders['pctWRetire'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
# apply normalization technique, using a range of [0,1]
murders['pctWRetire_norm'] = (murders['pctWRetire'] - murders['pctWRetire'].min()) / (murders['pctWRetire'].max() - murders['pctWRetire'].min())
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 |
murders['blackPerCap'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['blackPerCap'].min())
print(murders['blackPerCap'].max())
#Determine bins based on quantiles
blackPerCap_ = pd.qcut(murders['blackPerCap'], q=4)
#Check the value counts of each bin to ensure they are balanced
blackPerCap_.value_counts()
#Create bin labels
blackPerCap_labels = ['0-7000', '6500-10000', '10000-15000', '15000-250000']
#Create bins
blackPerCap_bin = [-1, 7000, 10000, 15000, 250000]
#Add new catrgory
murders['blackPerCap_bins'] = pd.cut(murders['blackPerCap'], bins=blackPerCap_bin, labels=blackPerCap_labels)
murders.head()
0 212120
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 |
murders['blackPerCap_bins'].value_counts()
0-7000 536 10000-15000 498 6500-10000 449 15000-250000 436 Name: blackPerCap_bins, dtype: int64
murders['indianPerCap'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['indianPerCap'].min())
print(murders['indianPerCap'].max())
#Determine bins based on quantiles
indianPerCap_ = pd.qcut(murders['indianPerCap'], q=4)
#Check the value counts of each bin to ensure they are balanced
indianPerCap_.value_counts()
#Create bin labels
indianPerCap_labels = ['0-6500', '6500-10000', '10000-15000', '15000-500000']
#Create bins
indianPerCap_bin = [-1, 6500, 10000, 15000, 500000]
#Add new category
murders['indianPerCap_bins'] = pd.cut(murders['indianPerCap'], bins=indianPerCap_bin, labels=indianPerCap_labels)
murders.head()
0 480000
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 |
murders['indianPerCap_bins'].value_counts()
0-6500 497 10000-15000 481 6500-10000 474 15000-500000 467 Name: indianPerCap_bins, dtype: int64
murders['AsianPerCap'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['AsianPerCap'].min())
print(murders['AsianPerCap'].max())
#Determine bins based on quantiles
AsianPerCap_ = pd.qcut(murders['AsianPerCap'], q=4)
#Check the value counts of each bin to ensure they are balanced
AsianPerCap_.value_counts()
#Create bin labels
AsianPerCap_labels = ['0-8500', '8500-12500', '12500-17500', '17500-106500']
#Create bins
AsianPerCap_bin = [-1, 8500, 12500, 17500, 106500]
#Add new category
murders['AsianPerCap_bins'] = pd.cut(murders['AsianPerCap'], bins=AsianPerCap_bin, labels=AsianPerCap_labels)
murders.head()
0 106165
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 |
murders['AsianPerCap_bins'].value_counts()
8500-12500 501 17500-106500 477 0-8500 476 12500-17500 465 Name: AsianPerCap_bins, dtype: int64
murders['OtherPerCap'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['OtherPerCap'].min())
print(murders['OtherPerCap'].max())
#Determine bins based on quantiles
OtherPerCap_ = pd.qcut(murders['OtherPerCap'], q=4)
#Check the value counts of each bin to ensure they are balanced
OtherPerCap_.value_counts()
#Create bin labels
OtherPerCap_labels = ['0-5500', '5500-8000', '8000-11500', '11500-137000']
#Create bins
OtherPerCap_bin = [-1, 5500, 8000, 11500, 137000]
#Add new category
murders['OtherPerCap_bins'] = pd.cut(murders['OtherPerCap'], bins=OtherPerCap_bin, labels=OtherPerCap_labels)
murders.head()
0.0 137000.0
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 |
murders['OtherPerCap_bins'].value_counts()
8000-11500 531 11500-137000 472 0-5500 469 5500-8000 447 Name: OtherPerCap_bins, dtype: int64
murders['HispPerCap'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['HispPerCap'].min())
print(murders['HispPerCap'].max())
#Determine bins based on quantiles
HispPerCap_ = pd.qcut(murders['HispPerCap'], q=4)
#Check the value counts of each bin to ensure they are balanced
HispPerCap_.value_counts()
#Create bin labels
HispPerCap_labels = ['0-7500', '7500-1000', '1000-13500', '13500-55000']
#Create bins
HispPerCap_bin = [-1, 7500, 10000, 13500, 55000]
#Add new category
murders['HispPerCap_bins'] = pd.cut(murders['HispPerCap'], bins=HispPerCap_bin, labels=HispPerCap_labels)
murders.head()
0 54648
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 |
murders['HispPerCap_bins'].value_counts()
0-7500 522 7500-1000 487 13500-55000 469 1000-13500 441 Name: HispPerCap_bins, dtype: int64
murders['PctPopUnderPov'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['PctPopUnderPov'].min())
print(murders['PctPopUnderPov'].max())
#Determine bins based on quantiles
PctPopUnderPov_ = pd.qcut(murders['PctPopUnderPov'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctPopUnderPov_.value_counts()
#Create bin labels
PctPopUnderPov_labels = ['0-5%', '5-10%', '10-17%', '17-60%']
#Create bins
PctPopUnderPov_bin = [0, 5, 10, 17, 60]
#Add new category
murders['PctPopUnderPov_bins'] = pd.cut(murders['PctPopUnderPov'], bins=PctPopUnderPov_bin, labels=PctPopUnderPov_labels)
murders.head()
0.64 48.82
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% |
murders['PctPopUnderPov_bins'].value_counts()
0-5% 522 17-60% 483 5-10% 477 10-17% 437 Name: PctPopUnderPov_bins, dtype: int64
murders['PctLess9thGrade'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['PctLess9thGrade'].min())
print(murders['PctLess9thGrade'].max())
#Determine bins based on quantiles
PctLess9thGrade_ = pd.qcut(murders['PctLess9thGrade'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctLess9thGrade_.value_counts()
#Create bin labels
PctLess9thGrade_labels = ['0-5%', '5-8%', '8-12%', '12-50%']
#Create bins
PctLess9thGrade_bin = [0, 5, 8, 12, 50]
#Add new category
murders['PctLess9thGrade_bins'] = pd.cut(murders['PctLess9thGrade'], bins=PctLess9thGrade_bin, labels=PctLess9thGrade_labels)
murders.head()
0.2 49.89
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% |
murders['PctLess9thGrade_bins'].value_counts()
0-5% 516 12-50% 488 5-8% 462 8-12% 453 Name: PctLess9thGrade_bins, dtype: int64
murders['PctBSorMore'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['PctBSorMore'].min())
print(murders['PctBSorMore'].max())
#Determine bins based on quantiles
PctBSorMore_ = pd.qcut(murders['PctBSorMore'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctBSorMore_.value_counts()
#Create bin labels
PctBSorMore_labels = ['0-14', '14-19', '19-30', '30-80']
#Create bins
PctBSorMore_bin = [0, 14, 19, 30, 80]
#Add new category
murders['PctBSorMore_bins'] = pd.cut(murders['PctBSorMore'], bins=PctBSorMore_bin, labels=PctBSorMore_labels)
murders.head()
1.63 73.63
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 |
murders['PctBSorMore_bins'].value_counts()
19-30 565 0-14 473 14-19 443 30-80 438 Name: PctBSorMore_bins, dtype: int64
murders['PctEmplManu'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
# apply normalization technique, using a range of [0,1]
murders['PctEmplManu_norm'] = (murders['PctEmplManu'] - murders['PctEmplManu'].min()) / (murders['PctEmplManu'].max() - murders['PctEmplManu'].min())
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 |
murders['PctEmplProfServ'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
# apply normalization technique, using a range of [0,1]
murders['PctEmplProfServ_norm'] = (murders['PctEmplProfServ'] - murders['PctEmplProfServ'].min()) / (murders['PctEmplProfServ'].max() - murders['PctEmplProfServ'].min())
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | LandArea_bins | PopDens_bins | PctUsePubTrans_bins | murdPerPop_class_target | murdPerPop_reg_target | householdsize_norm | agePct12t21_norm | agePct65up_norm | PctEmplProfServ_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% | 0.801139 | 85-90% | 0-7 | 1200-2000 | 0.5-50% | No | 0.000000 | 0.407609 | 0.158370 | 0.189200 | 0.372916 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% | 0.890754 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | No | 0.000000 | 0.331522 | 0.129065 | 0.303659 | 0.381438 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% | 0.371197 | 32-85% | 7-14 | 2000-3300 | 0.5-50% | Yes | 0.091119 | 0.225543 | 0.136090 | 0.168656 | 0.237681 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% | 50-65% | 0.456466 | 85-90% | 26-4000 | 1200-2000 | 0.5-50% | Yes | 0.050829 | 0.230978 | 0.271176 | 0.226961 | 0.336050 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% | 75-95% | 0.773565 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | No | 0.000000 | 0.271739 | 0.132276 | 0.249658 | 0.342349 |
murders = murders.drop(['PctEmplProfServ_bins'], axis=1)
murders['PctOccupManu'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
# apply normalization technique, using a range of [0,1]
murders['PctOccupManu_norm'] = (murders['PctOccupManu'] - murders['PctOccupManu'].min()) / (murders['PctOccupManu'].max() - murders['PctOccupManu'].min())
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 |
murders['MalePctDivorce'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
# apply normalization technique, using a range of [0,1]
murders['MalePctDivorce_norm'] = (murders['MalePctDivorce'] - murders['MalePctDivorce'].min()) / (murders['MalePctDivorce'].max() - murders['MalePctDivorce'].min())
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 |
murders['MalePctNevMarr'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
# apply normalization technique, using a range of [0,1]
murders['MalePctNevMarr_norm'] = (murders['MalePctNevMarr'] - murders['MalePctNevMarr'].min()) / (murders['MalePctNevMarr'].max() - murders['MalePctNevMarr'].min())
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | LandArea_bins | PopDens_bins | PctUsePubTrans_bins | murdPerPop_class_target | murdPerPop_reg_target | householdsize_norm | agePct12t21_norm | agePct65up_norm | PctEmplProfServ_norm | MalePctNevMarr_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% | 0.801139 | 85-90% | 0-7 | 1200-2000 | 0.5-50% | No | 0.000000 | 0.407609 | 0.158370 | 0.189200 | 0.372916 | 0.222845 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% | 0.890754 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | No | 0.000000 | 0.331522 | 0.129065 | 0.303659 | 0.381438 | 0.247899 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% | 0.371197 | 32-85% | 7-14 | 2000-3300 | 0.5-50% | Yes | 0.091119 | 0.225543 | 0.136090 | 0.168656 | 0.237681 | 0.213508 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% | 50-65% | 0.456466 | 85-90% | 26-4000 | 1200-2000 | 0.5-50% | Yes | 0.050829 | 0.230978 | 0.271176 | 0.226961 | 0.336050 | 0.330843 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% | 75-95% | 0.773565 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | No | 0.000000 | 0.271739 | 0.132276 | 0.249658 | 0.342349 | 0.373327 |
murders = murders.drop(['MalePctNevMarr_bins'], axis=1)
murders['PctFam2Par'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['PctFam2Par'].min())
print(murders['PctFam2Par'].max())
#Determine bins based on quantiles
PctFam2Par_ = pd.qcut(murders['PctFam2Par'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctFam2Par_.value_counts()
#Create bin labels
PctFam2Par_labels = ['0-67%', '67-70%', '75-82%', '82-95%']
#Create bins
PctFam2Par_bin = [0, 67, 75, 82, 95]
#Add new category
murders['PctFam2Par_bins'] = pd.cut(murders['PctFam2Par'], bins=PctFam2Par_bin, labels=PctFam2Par_labels)
murders.head()
32.24 93.6
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% |
murders['PctFam2Par_bins'].value_counts()
67-70% 533 75-82% 482 82-95% 462 0-67% 442 Name: PctFam2Par_bins, dtype: int64
murders['PctWorkMomYoungKids'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
# apply normalization technique, using a range of [0,1]
murders['PctWorkMomYoungKids_norm'] = (murders['PctWorkMomYoungKids'] - murders['PctWorkMomYoungKids'].min()) / (murders['PctWorkMomYoungKids'].max() - murders['PctWorkMomYoungKids'].min())
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 |
murders['PctImmigRecent'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['PctImmigRecent'].min())
print(murders['PctImmigRecent'].max())
#Determine bins based on quantiles
PctImmigRecent_ = pd.qcut(murders['PctImmigRecent'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctImmigRecent_.value_counts()
#Create bin labels
PctImmigRecent_labels = ['0-7%', '7-12%', '12-18%', '18-65%']
#Create bins
PctImmigRecent_bin = [-1, 7, 12, 18, 65]
#Add new category
murders['PctImmigRecent_bins'] = pd.cut(murders['PctImmigRecent'], bins=PctImmigRecent_bin, labels=PctImmigRecent_labels)
murders.head()
0.0 64.29
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% |
murders['PctImmigRecent_bins'].value_counts()
12-18% 515 18-65% 486 0-7% 482 7-12% 436 Name: PctImmigRecent_bins, dtype: int64
murders['PctRecentImmig'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['PctRecentImmig'].min())
print(murders['PctRecentImmig'].max())
#Determine bins based on quantiles
PctRecentImmig_ = pd.qcut(murders['PctRecentImmig'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctRecentImmig_.value_counts()
#Create bin labels
PctRecentImmig_labels = ['0-0.2%', '0.2-0.5%', '0.5-1.3%', '1.3-13.7%']
#Create bins
PctRecentImmig_bin = [-1, 0.2, 0.5, 1.3, 13.71]
#Add new category
murders['PctRecentImmig_bins'] = pd.cut(murders['PctRecentImmig'], bins=PctRecentImmig_bin, labels=PctRecentImmig_labels)
murders.head()
0.0 13.71
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% |
murders['PctRecentImmig_bins'].value_counts()
0-0.2% 526 1.3-13.7% 516 0.5-1.3% 474 0.2-0.5% 403 Name: PctRecentImmig_bins, dtype: int64
murders['PctPersOwnOccup'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
# apply normalization technique, using a range of [0,1]
murders['PctPersOwnOccup_norm'] = (murders['PctPersOwnOccup'] - murders['PctPersOwnOccup'].min()) / (murders['PctPersOwnOccup'].max() - murders['PctPersOwnOccup'].min())
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 |
murders['PctHousLess3BR'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
# apply normalization technique, using a range of [0,1]
murders['PctHousLess3BR_norm'] = (murders['PctHousLess3BR'] - murders['PctHousLess3BR'].min()) / (murders['PctHousLess3BR'].max() - murders['PctHousLess3BR'].min())
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 |
murders['MedNumBR'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['MedNumBR'].min())
print(murders['MedNumBR'].max())
#Determine bins based on quantiles
murders['MedNumBR'].describe()
#Create bin labels
MedNumBR_labels = ['0-2.5', '2.5-4']
#Create bins
MedNumBR_bin = [0, 2.5, 4]
#Add new category
murders['MedNumBR_bins'] = pd.cut(murders['MedNumBR'], bins=MedNumBR_bin, labels=MedNumBR_labels)
murders.head()
1 4
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 |
murders['MedNumBR_bins'].value_counts()
2.5-4 1176 0-2.5 743 Name: MedNumBR_bins, dtype: int64
murders['PctHousOccup'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['PctHousOccup'].min())
print(murders['PctHousOccup'].max())
#Determine bins based on quantiles
PctHousOccup_ = pd.qcut(murders['PctHousOccup'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctHousOccup_.value_counts()
#Create bin labels
PctHousOccup_labels = ['30-91%', '91-94%', '94-96%', '96-99%']
#Create bins
PctHousOccup_bin = [30, 91, 94, 96, 99]
#Add new category
murders['PctHousOccup_bins'] = pd.cut(murders['PctHousOccup'], bins=PctHousOccup_bin, labels=PctHousOccup_labels)
murders.head()
37.47 99.0
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% |
murders['PctHousOccup_bins'].value_counts()
94-96% 495 30-91% 486 91-94% 475 96-99% 463 Name: PctHousOccup_bins, dtype: int64
murders['PctVacantBoarded'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['PctVacantBoarded'].min())
print(murders['PctVacantBoarded'].max())
#Determine bins based on quantiles
PctVacantBoarded_ = pd.qcut(murders['PctVacantBoarded'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctVacantBoarded_.value_counts()
#Create bin labels
PctVacantBoarded_labels = ['0-0.75%', '0.75-1.75%', '1.75-4%', '4-40%']
#Create bins
PctVacantBoarded_bin = [-1, 0.75, 1.75, 4, 40]
#Add new category
murders['PctVacantBoarded_bins'] = pd.cut(murders['PctVacantBoarded'], bins=PctVacantBoarded_bin, labels=PctVacantBoarded_labels)
murders.head()
0.0 39.89
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% |
murders['PctVacantBoarded_bins'].value_counts()
1.75-4% 540 0.75-1.75% 500 0-0.75% 473 4-40% 406 Name: PctVacantBoarded_bins, dtype: int64
murders['PctVacMore6Mos'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
# apply normalization technique, using a range of [0,1]
murders['PctVacMore6Mos_norm'] = (murders['PctVacMore6Mos'] - murders['PctVacMore6Mos'].min()) / (murders['PctVacMore6Mos'].max() - murders['PctVacMore6Mos'].min())
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 |
murders['MedYrHousBuilt'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['MedYrHousBuilt'].min())
print(murders['MedYrHousBuilt'].max())
#Determine bins based on quantiles
MedYrHousBuilt_ = pd.qcut(murders['MedYrHousBuilt'], q=4)
#Check the value counts of each bin to ensure they are balanced
MedYrHousBuilt_.value_counts()
#Create bin labels
MedYrHousBuilt_labels = ['1939-1956', '1956-1964', '1964-1971', '1971-1987']
#Create bins
MedYrHousBuilt_bin = [0, 1956, 1964, 1971, 1987]
#Add new category
murders['MedYrHousBuilt_bins'] = pd.cut(murders['MedYrHousBuilt'], bins=MedYrHousBuilt_bin, labels=MedYrHousBuilt_labels)
murders.head()
1939 1987
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 |
murders['MedYrHousBuilt_bins'].value_counts()
1956-1964 516 1939-1956 501 1964-1971 454 1971-1987 448 Name: MedYrHousBuilt_bins, dtype: int64
murders['PctWOFullPlumb'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['PctWOFullPlumb'].min())
print(murders['PctWOFullPlumb'].max())
#Determine bins based on quantiles
PctWOFullPlumb_ = pd.qcut(murders['PctWOFullPlumb'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctWOFullPlumb_.value_counts()
#Create bin labels
PctWOFullPlumb_labels = ['0-0.16%', '0.16-0.3%', '0.3-0.5%', '0.5-5%']
#Create bins
PctWOFullPlumb_bin = [-1, 0.16, 0.3, 0.5, 5.33]
#Add new category
murders['PctWOFullPlumb_bins'] = pd.cut(murders['PctWOFullPlumb'], bins=PctWOFullPlumb_bin, labels=PctWOFullPlumb_labels)
murders.head()
0.0 5.33
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% |
murders['PctWOFullPlumb_bins'].value_counts()
0.5-5% 564 0.3-0.5% 475 0-0.16% 453 0.16-0.3% 427 Name: PctWOFullPlumb_bins, dtype: int64
murders['RentQrange'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['RentQrange'].min())
print(murders['RentQrange'].max())
#Determine bins based on quantiles
RentQrange_ = pd.qcut(murders['RentQrange'], q=4)
#Check the value counts of each bin to ensure they are balanced
RentQrange_.value_counts()
#Create bin labels
RentQrange_labels = ['0-140', '140-170', '170-230', '230-805']
#Create bins
RentQrange_bin = [-1, 140, 170, 230, 803]
#Add new category
murders['RentQrange_bins'] = pd.cut(murders['RentQrange'], bins=RentQrange_bin, labels=RentQrange_labels)
murders.head()
0 803
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 |
murders['RentQrange_bins'].value_counts()
230-805 534 0-140 489 170-230 482 140-170 414 Name: RentQrange_bins, dtype: int64
murders['MedRentPctHousInc'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
# apply normalization technique, using a range of [0,1]
murders['MedRentPctHousInc_norm'] = (murders['MedRentPctHousInc'] - murders['MedRentPctHousInc'].min()) / (murders['MedRentPctHousInc'].max() - murders['MedRentPctHousInc'].min())
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 |
murders['MedOwnCostPctInc'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
# apply normalization technique, using a range of [0,1]
murders['MedOwnCostPctInc_norm'] = (murders['MedOwnCostPctInc'] - murders['MedOwnCostPctInc'].min()) / (murders['MedOwnCostPctInc'].max() - murders['MedOwnCostPctInc'].min())
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 |
murders['MedOwnCostPctIncNoMtg'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['MedOwnCostPctIncNoMtg'].min())
print(murders['MedOwnCostPctIncNoMtg'].max())
#Determine bins based on quantiles
MedOwnCostPctIncNoMtg_ = pd.qcut(murders['MedOwnCostPctIncNoMtg'], q=4)
#Check the value counts of each bin to ensure they are balanced
MedOwnCostPctIncNoMtg_.value_counts()
#Create bin labels
MedOwnCostPctIncNoMtg_labels = ['10-12%', '12-13%', '13-14%', '14-25%']
#Create bins
MedOwnCostPctIncNoMtg_bin = [10, 12, 13, 14, 25]
#Add new category
murders['MedOwnCostPctIncNoMtg_bins'] = pd.cut(murders['MedOwnCostPctIncNoMtg'], bins=MedOwnCostPctIncNoMtg_bin, labels=MedOwnCostPctIncNoMtg_labels)
murders.head()
10.1 23.4
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% |
murders['MedOwnCostPctIncNoMtg_bins'].value_counts()
12-13% 559 10-12% 538 13-14% 434 14-25% 388 Name: MedOwnCostPctIncNoMtg_bins, dtype: int64
murders['PctBornSameState'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['PctBornSameState'].min())
print(murders['PctBornSameState'].max())
#Determine bins based on quantiles
PctBornSameState_ = pd.qcut(murders['PctBornSameState'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctBornSameState_.value_counts()
#Create bin labels
PctBornSameState_labels = ['0-50%', '50-65%', '65-75%', '75-95%']
#Create bins
PctBornSameState_bin = [0, 50, 65, 75, 95]
#Add new category
murders['PctBornSameState_bins'] = pd.cut(murders['PctBornSameState'], bins=PctBornSameState_bin, labels=PctBornSameState_labels)
murders.head()
6.75 93.14
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% | 50-65% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% | 75-95% |
murders['PctBornSameState_bins'].value_counts()
0-50% 535 50-65% 520 65-75% 433 75-95% 431 Name: PctBornSameState_bins, dtype: int64
murders['PctSameHouse85'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
# apply normalization technique, using a range of [0,1]
murders['PctSameHouse85_norm'] = (murders['PctSameHouse85'] - murders['PctSameHouse85'].min()) / (murders['PctSameHouse85'].max() - murders['PctSameHouse85'].min())
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% | 0.801139 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% | 0.890754 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% | 0.371197 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% | 50-65% | 0.456466 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% | 75-95% | 0.773565 |
murders['PctSameState85'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['PctSameState85'].min())
print(murders['PctSameState85'].max())
#Determine bins based on quantiles
PctSameState85_ = pd.qcut(murders['PctSameState85'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctSameState85_.value_counts()
#Create bin labels
PctSameState85_labels = ['32-85%', '85-90%', '90-93%', '93-100%']
#Create bins
PctSameState85_bin = [32, 85, 90, 93, 100]
#Add new category
murders['PctSameState85_bins'] = pd.cut(murders['PctSameState85'], bins=PctSameState85_bin, labels=PctSameState85_labels)
murders.head()
32.83 99.9
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% | 0.801139 | 85-90% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% | 0.890754 | 93-100% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% | 0.371197 | 32-85% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% | 50-65% | 0.456466 | 85-90% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% | 75-95% | 0.773565 | 93-100% |
murders['PctSameState85_bins'].value_counts()
32-85% 505 85-90% 500 90-93% 473 93-100% 441 Name: PctSameState85_bins, dtype: int64
murders['LandArea'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['LandArea'].min())
print(murders['LandArea'].max())
#Determine bins based on quantiles
LandArea_ = pd.qcut(murders['LandArea'], q=4)
#Check the value counts of each bin to ensure they are balanced
LandArea_.value_counts()
#Create bin labels
LandArea_labels = ['0-7', '7-14', '14-26', '26-4000']
#Create bins
LandArea_bin = [0, 7, 14, 26, 3570]
#Add new category
murders['LandArea_bins'] = pd.cut(murders['LandArea'], bins=LandArea_bin, labels=LandArea_labels)
murders.head()
0.9 3569.8
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | LandArea_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% | 0.801139 | 85-90% | 0-7 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% | 0.890754 | 93-100% | 7-14 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% | 0.371197 | 32-85% | 7-14 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% | 50-65% | 0.456466 | 85-90% | 26-4000 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% | 75-95% | 0.773565 | 93-100% | 7-14 |
murders['LandArea_bins'].value_counts()
7-14 525 0-7 467 26-4000 465 14-26 462 Name: LandArea_bins, dtype: int64
murders['PopDens'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['PopDens'].min())
print(murders['PopDens'].max())
#Determine bins based on quantiles
PopDens_ = pd.qcut(murders['PopDens'], q=4)
#Check the value counts of each bin to ensure they are balanced
PopDens_.value_counts()
#Create bin labels
PopDens_labels = ['10-1200', '1200-2000', '2000-3300', '3300-45000']
#Create bins
PopDens_bin = [9, 1200, 2000, 3300, 45000]
#Add new category
murders['PopDens_bins'] = pd.cut(murders['PopDens'], bins=PopDens_bin, labels=PopDens_labels)
murders.head()
10.0 44229.9
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | LandArea_bins | PopDens_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% | 0.801139 | 85-90% | 0-7 | 1200-2000 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% | 0.890754 | 93-100% | 7-14 | 2000-3300 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% | 0.371197 | 32-85% | 7-14 | 2000-3300 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% | 50-65% | 0.456466 | 85-90% | 26-4000 | 1200-2000 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% | 75-95% | 0.773565 | 93-100% | 7-14 | 2000-3300 |
murders['PopDens_bins'].value_counts()
10-1200 496 2000-3300 488 3300-45000 478 1200-2000 457 Name: PopDens_bins, dtype: int64
murders['PctUsePubTrans'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['PctUsePubTrans'].min())
print(murders['PctUsePubTrans'].max())
#Determine bins based on quantiles
PctUsePubTrans_ = pd.qcut(murders['PctUsePubTrans'], q=4)
#Check the value counts of each bin to ensure they are balanced
PctUsePubTrans_.value_counts()
#Create bin labels
PctUsePubTrans_labels = ['0-0.4%', '0.4-1.2%', '1.2-3.3%', '3.3-55%']
#Create bins
PctUsePubTrans_bin = [-1, 0.4, 1.2, 3.3, 55]
#Add new category
murders['PctUsePubTrans_bins'] = pd.cut(murders['PctUsePubTrans'], bins=PctUsePubTrans_bin, labels=PctUsePubTrans_labels)
murders.head()
0.0 54.33
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | LandArea_bins | PopDens_bins | PctUsePubTrans_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% | 0.801139 | 85-90% | 0-7 | 1200-2000 | 3.3-55% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% | 0.890754 | 93-100% | 7-14 | 2000-3300 | 3.3-55% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% | 0.371197 | 32-85% | 7-14 | 2000-3300 | 3.3-55% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% | 50-65% | 0.456466 | 85-90% | 26-4000 | 1200-2000 | 0.4-1.2% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% | 75-95% | 0.773565 | 93-100% | 7-14 | 2000-3300 | 3.3-55% |
murders['PctUsePubTrans_bins'].value_counts()
0-0.4% 516 3.3-55% 495 1.2-3.3% 487 0.4-1.2% 421 Name: PctUsePubTrans_bins, dtype: int64
murders['LemasPctOfficDrugUn'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
#Check the minimum and maximum value
print(murders['LemasPctOfficDrugUn'].min())
print(murders['LemasPctOfficDrugUn'].max())
#Determine bins based on quantiles
murders['LemasPctOfficDrugUn'].describe()
#Create bin labels
PctUsePubTrans_labels = ['0%', '0.5-50%']
#Create bins
PctUsePubTrans_bin = [-1, 0.44, 60]
#Add new category
murders['PctUsePubTrans_bins'] = pd.cut(murders['PctUsePubTrans'], bins=PctUsePubTrans_bin, labels=PctUsePubTrans_labels)
murders.head()
0.0 48.44
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | LandArea_bins | PopDens_bins | PctUsePubTrans_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% | 0.801139 | 85-90% | 0-7 | 1200-2000 | 0.5-50% |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% | 0.890754 | 93-100% | 7-14 | 2000-3300 | 0.5-50% |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% | 0.371197 | 32-85% | 7-14 | 2000-3300 | 0.5-50% |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% | 50-65% | 0.456466 | 85-90% | 26-4000 | 1200-2000 | 0.5-50% |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% | 75-95% | 0.773565 | 93-100% | 7-14 | 2000-3300 | 0.5-50% |
murders['PctUsePubTrans_bins'].value_counts()
0.5-50% 1386 0% 533 Name: PctUsePubTrans_bins, dtype: int64
murders['murdPerPop'].hist(bins=50)
plt.title('Distribution')
plt.xlabel('x lable')
plt.ylabel('y lable')
plt.show()
murders_backup = murders
Prepare the murder dataset for CLASSIFICATION. Here, our target variable will be transformed into a binary categorical variable:
#Check the minimum and maximum value
print(murders['murdPerPop'].min())
print(murders['murdPerPop'].max())
#Determine bins based on quantiles
murdPerPop_ = pd.qcut(murders['murdPerPop'], q=2)
#Check the value counts of each bin to ensure they are balanced
murdPerPop_.value_counts()
#Create bin labels
murdPerPop_labels = ['No', 'Yes']
#Create bins
murdPerPop_bin = [-1, 0, 100]
#Add new category
murders['murdPerPop_class_target'] = pd.cut(murders['murdPerPop'], bins=murdPerPop_bin, labels=murdPerPop_labels)
murders.head()
0.0 91.09
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | householdsize_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_bins | agePct65up_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_bins | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_bins | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | LandArea_bins | PopDens_bins | PctUsePubTrans_bins | murdPerPop_class_target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 2.8-5.3 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 12.3-13.6% | 8.8-11.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 28-65% | 0.096037 | 0.090802 | 25-29% | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% | 0.801139 | 85-90% | 0-7 | 1200-2000 | 0.5-50% | No |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 2.6-2.8 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 4.5-12.2% | 14.4-52.7% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 28-65% | 0.117016 | 0.123821 | 25-29% | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% | 0.890754 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | No |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 1.5-2.5 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 4.5-12.2% | 8.8-11.7% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 20-23% | 0.172960 | 0.469929 | 25-29% | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% | 0.371197 | 32-85% | 7-14 | 2000-3300 | 0.5-50% | Yes |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 1.5-2.5 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 15.4-54.4% | 11.7-14.4% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 23-28% | 0.311189 | 0.546580 | 29-34% | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% | 50-65% | 0.456466 | 85-90% | 26-4000 | 1200-2000 | 0.5-50% | Yes |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 2.5-2.6 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 4.5-12.2% | 11.7-14.4% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 23-28% | 0.166200 | 0.226415 | 34-77% | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% | 75-95% | 0.773565 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | No |
murders['murdPerPop_class_target'].value_counts()
Yes 1043 No 876 Name: murdPerPop_class_target, dtype: int64
murders_classification = murders[['state','pop_bins','householdsize_norm', 'racepctblack_bins', 'racePctWhite_bins',
'racePctAsian_bins', 'racePctHisp_bins', 'agePct12t21_norm', 'agePct65up_norm', 'pctUrban_bins',
'medIncome_bins', 'pctWFarmSelf_bins', 'pctWInvInc_norm', 'pctWPubAsst_bins', 'pctWRetire_norm',
'blackPerCap_bins', 'indianPerCap_bins', 'AsianPerCap_bins', 'OtherPerCap_bins', 'HispPerCap_bins',
'PctPopUnderPov_bins', 'PctLess9thGrade_bins', 'PctBSorMore_bins','PctEmplManu_norm','PctEmplProfServ_norm',
'PctOccupManu_norm','MalePctDivorce_norm','MalePctNevMarr_norm','PctFam2Par_bins','PctWorkMomYoungKids_norm',
'PctImmigRecent_bins','PctRecentImmig_bins','PctPersOwnOccup_norm','PctHousLess3BR_norm','MedNumBR_bins',
'PctHousOccup_bins','PctVacantBoarded_bins','PctVacMore6Mos_norm','MedYrHousBuilt_bins','PctWOFullPlumb_bins',
'RentQrange_bins','MedRentPctHousInc_norm','MedOwnCostPctInc_norm','MedOwnCostPctIncNoMtg_bins',
'PctBornSameState_bins','PctSameHouse85_norm','PctSameState85_bins','LandArea_bins','PopDens_bins',
'PctUsePubTrans_bins', 'murdPerPop_class_target']].copy()
murders_classification.head()
murders_classification.info()
murders_class_backup = murders_classification
murders_class_backup.info()
murders_classification['state'] = murders_classification['state'].astype('category')
murders_class_backup = murders_classification
Prepare the murder dataset for REGRESSION. Here, the target variable will be normalized numerically.
***min/max scalar function
# apply normalization technique, using a range of [0,1]
murders['murdPerPop_reg_target'] = (murders['murdPerPop'] - murders['murdPerPop'].min()) / (murders['murdPerPop'].max() - murders['murdPerPop'].min())
murders.head()
murders_regression = murders[['state','pop_bins','householdsize_norm', 'racepctblack_bins', 'racePctWhite_bins',
'racePctAsian_bins', 'racePctHisp_bins', 'agePct12t21_norm', 'agePct65up_norm', 'pctUrban_bins',
'medIncome_bins', 'pctWFarmSelf_bins', 'pctWInvInc_norm', 'pctWPubAsst_bins', 'pctWRetire_norm',
'blackPerCap_bins', 'indianPerCap_bins', 'AsianPerCap_bins', 'OtherPerCap_bins', 'HispPerCap_bins',
'PctPopUnderPov_bins', 'PctLess9thGrade_bins', 'PctBSorMore_bins','PctEmplManu_norm','PctEmplProfServ_norm',
'PctOccupManu_norm','MalePctDivorce_norm','MalePctNevMarr_norm','PctFam2Par_bins','PctWorkMomYoungKids_norm',
'PctImmigRecent_bins','PctRecentImmig_bins','PctPersOwnOccup_norm','PctHousLess3BR_norm','MedNumBR_bins',
'PctHousOccup_bins','PctVacantBoarded_bins','PctVacMore6Mos_norm','MedYrHousBuilt_bins','PctWOFullPlumb_bins',
'RentQrange_bins','MedRentPctHousInc_norm','MedOwnCostPctInc_norm','MedOwnCostPctIncNoMtg_bins',
'PctBornSameState_bins','PctSameHouse85_norm','PctSameState85_bins','LandArea_bins','PopDens_bins',
'PctUsePubTrans_bins', 'murdPerPop_reg_target']].copy()
murders_regression.head()
| state | pop_bins | householdsize_norm | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_norm | agePct65up_norm | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_norm | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_norm | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | LandArea_bins | PopDens_bins | PctUsePubTrans_bins | murdPerPop_reg_target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NJ | 10000-13500 | 0.407609 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 0.158370 | 0.189200 | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 0.372916 | 0.096037 | 0.090802 | 0.222845 | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% | 0.801139 | 85-90% | 0-7 | 1200-2000 | 0.5-50% | 0.000000 |
| 1 | PA | 19000-29000 | 0.331522 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 0.129065 | 0.303659 | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 0.381438 | 0.117016 | 0.123821 | 0.247899 | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% | 0.890754 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | 0.000000 |
| 2 | OR | 29000-51500 | 0.225543 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 0.136090 | 0.168656 | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 0.237681 | 0.172960 | 0.469929 | 0.213508 | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% | 0.371197 | 32-85% | 7-14 | 2000-3300 | 0.5-50% | 0.091119 |
| 4 | MO | 515000-7500000 | 0.230978 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 0.271176 | 0.226961 | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 0.336050 | 0.311189 | 0.546580 | 0.330843 | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% | 50-65% | 0.456466 | 85-90% | 26-4000 | 1200-2000 | 0.5-50% | 0.050829 |
| 5 | MA | 19000-29000 | 0.271739 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 0.132276 | 0.249658 | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 0.342349 | 0.166200 | 0.226415 | 0.373327 | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% | 75-95% | 0.773565 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | 0.000000 |
murders_reg_backup = murders_regression
murders_regression['state'] = murders_regression['state'].astype('category')
Here, I want to make sure both murders and robberies datasets contain the same columns, to avoid repeating the same normalization steps.
print(murders_copy.columns.difference(robberies.columns))
print(robberies.columns.difference(murders_copy.columns))
Index(['murdPerPop'], dtype='object') Index(['robbbPerPop'], dtype='object')
Both dataframes have the exact columns, other than their target variables. Because of this, I will only normalize the robbbPerPop target variable, and create new datasets in preparation for our regression and classification models using previously normalized columns from the murders dataset.
murders.head()
| communityname | state | population | householdsize | racepctblack | racePctWhite | racePctAsian | racePctHisp | agePct12t21 | agePct65up | pctUrban | medIncome | pctWFarmSelf | pctWInvInc | pctWPubAsst | pctWRetire | blackPerCap | indianPerCap | AsianPerCap | OtherPerCap | HispPerCap | PctPopUnderPov | PctLess9thGrade | PctBSorMore | PctEmplManu | PctEmplProfServ | PctOccupManu | MalePctDivorce | MalePctNevMarr | PctFam2Par | PctWorkMomYoungKids | PctImmigRecent | PctRecentImmig | PctPersOwnOccup | PctHousLess3BR | MedNumBR | PctHousOccup | PctVacantBoarded | PctVacMore6Mos | MedYrHousBuilt | PctWOFullPlumb | RentQrange | MedRentPctHousInc | MedOwnCostPctInc | MedOwnCostPctIncNoMtg | PctBornSameState | PctSameHouse85 | PctSameState85 | LandArea | PopDens | PctUsePubTrans | LemasPctOfficDrugUn | murdPerPop | pop_bins | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctOccupManu_norm | MalePctDivorce_norm | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | LandArea_bins | PopDens_bins | PctUsePubTrans_bins | murdPerPop_class_target | murdPerPop_reg_target | householdsize_norm | agePct12t21_norm | agePct65up_norm | PctEmplProfServ_norm | MalePctNevMarr_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BerkeleyHeightstownship | NJ | 11980 | 3.10 | 1.37 | 91.78 | 6.50 | 1.88 | 12.47 | 11.33 | 100.0 | 75122 | 1.55 | 70.20 | 1.03 | 18.39 | 13600 | 5725 | 27101 | 5115.0 | 22838 | 1.96 | 5.81 | 48.18 | 14.65 | 28.82 | 5.49 | 3.67 | 26.38 | 91.43 | 44.56 | 8.69 | 0.93 | 91.46 | 11.06 | 3 | 98.37 | 3.12 | 37.50 | 1959 | 0.28 | 316 | 23.8 | 21.1 | 14.0 | 53.72 | 65.29 | 89.14 | 6.5 | 1845.9 | 9.63 | 0.0 | 0.00 | 10000-13500 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 0.096037 | 0.090802 | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% | 0.801139 | 85-90% | 0-7 | 1200-2000 | 0.5-50% | No | 0.000000 | 0.407609 | 0.158370 | 0.189200 | 0.372916 | 0.222845 |
| 1 | Marpletownship | PA | 23123 | 2.82 | 0.80 | 95.57 | 3.44 | 0.85 | 11.01 | 17.18 | 100.0 | 47917 | 1.11 | 64.11 | 2.75 | 22.85 | 18137 | 0 | 20074 | 5250.0 | 12222 | 3.98 | 5.61 | 29.89 | 12.26 | 29.28 | 6.39 | 4.23 | 27.99 | 86.91 | 51.14 | 5.21 | 0.43 | 89.03 | 23.60 | 3 | 97.15 | 0.00 | 18.33 | 1958 | 0.14 | 205 | 27.6 | 20.7 | 12.5 | 77.17 | 71.27 | 96.12 | 10.6 | 2186.7 | 3.84 | 0.0 | 0.00 | 19000-29000 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 0.117016 | 0.123821 | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% | 0.890754 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | No | 0.000000 | 0.331522 | 0.129065 | 0.303659 | 0.381438 | 0.247899 |
| 2 | Tigardcity | OR | 29344 | 2.43 | 0.74 | 94.33 | 3.43 | 2.35 | 11.36 | 10.28 | 100.0 | 35669 | 1.15 | 55.73 | 2.94 | 14.56 | 16644 | 21606 | 15528 | 5954.0 | 8405 | 4.75 | 2.80 | 30.13 | 15.95 | 21.52 | 8.79 | 10.10 | 25.78 | 78.54 | 66.08 | 16.42 | 0.82 | 64.18 | 47.46 | 3 | 95.68 | 0.92 | 7.54 | 1976 | 0.12 | 150 | 24.1 | 21.7 | 11.6 | 44.77 | 36.60 | 82.85 | 10.6 | 2780.9 | 4.37 | 0.0 | 8.30 | 29000-51500 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 0.172960 | 0.469929 | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% | 0.371197 | 32-85% | 7-14 | 2000-3300 | 0.5-50% | Yes | 0.091119 | 0.225543 | 0.136090 | 0.168656 | 0.237681 | 0.213508 |
| 4 | Springfieldcity | MO | 140494 | 2.45 | 2.51 | 95.65 | 0.90 | 0.95 | 18.09 | 13.26 | 100.0 | 21577 | 1.00 | 41.15 | 7.12 | 14.09 | 7382 | 10264 | 10753 | 7192.0 | 8104 | 17.78 | 8.76 | 20.66 | 14.31 | 26.83 | 14.72 | 11.40 | 33.32 | 71.94 | 62.96 | 21.33 | 0.32 | 57.81 | 53.19 | 2 | 91.81 | 2.09 | 26.22 | 1966 | 0.31 | 134 | 26.4 | 17.3 | 11.7 | 64.35 | 42.29 | 85.66 | 70.4 | 1995.7 | 0.97 | 0.0 | 4.63 | 515000-7500000 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 0.311189 | 0.546580 | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% | 50-65% | 0.456466 | 85-90% | 26-4000 | 1200-2000 | 0.5-50% | Yes | 0.050829 | 0.230978 | 0.271176 | 0.226961 | 0.336050 | 0.330843 |
| 5 | Norwoodtown | MA | 28700 | 2.60 | 1.60 | 96.57 | 1.47 | 1.10 | 11.17 | 14.42 | 100.0 | 42805 | 0.39 | 47.70 | 5.41 | 17.23 | 17342 | 21482 | 12639 | 21852.0 | 22594 | 4.01 | 4.49 | 27.01 | 14.02 | 27.17 | 8.50 | 5.97 | 36.05 | 79.53 | 65.16 | 11.38 | 1.05 | 64.62 | 47.35 | 3 | 95.11 | 1.41 | 34.45 | 1956 | 0.28 | 361 | 24.4 | 20.8 | 12.5 | 77.30 | 63.45 | 93.53 | 10.9 | 2643.5 | 9.62 | 0.0 | 0.00 | 19000-29000 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 0.166200 | 0.226415 | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% | 75-95% | 0.773565 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | No | 0.000000 | 0.271739 | 0.132276 | 0.249658 | 0.342349 | 0.373327 |
#Create our normalized robberies dataframes
#REGRESSION DATASET
robberies_regression = murders[['state','pop_bins','householdsize_norm', 'racepctblack_bins', 'racePctWhite_bins',
'racePctAsian_bins', 'racePctHisp_bins', 'agePct12t21_norm', 'agePct65up_norm', 'pctUrban_bins', 'medIncome_bins',
'pctWFarmSelf_bins', 'pctWInvInc_norm', 'pctWPubAsst_bins', 'pctWRetire_norm', 'blackPerCap_bins',
'indianPerCap_bins', 'AsianPerCap_bins', 'OtherPerCap_bins', 'HispPerCap_bins', 'PctPopUnderPov_bins',
'PctLess9thGrade_bins', 'PctBSorMore_bins','PctEmplManu_norm','PctEmplProfServ_norm','PctOccupManu_norm',
'MalePctDivorce_norm','MalePctNevMarr_norm','PctFam2Par_bins','PctWorkMomYoungKids_norm','PctImmigRecent_bins',
'PctRecentImmig_bins','PctPersOwnOccup_norm','PctHousLess3BR_norm','MedNumBR_bins','PctHousOccup_bins',
'PctVacantBoarded_bins','PctVacMore6Mos_norm','MedYrHousBuilt_bins','PctWOFullPlumb_bins','RentQrange_bins',
'MedRentPctHousInc_norm','MedOwnCostPctInc_norm','MedOwnCostPctIncNoMtg_bins','PctBornSameState_bins',
'PctSameHouse85_norm','PctSameState85_bins','LandArea_bins','PopDens_bins','PctUsePubTrans_bins']].copy()
robberies_regression.info()
robberies_regression['state'] = robberies_regression['state'].astype('category')
# apply normalization technique, using a range of [0,1]
robberies_regression['robbbPerPop_reg_target'] = (robberies['robbbPerPop'] - robberies['robbbPerPop'].min()) / (robberies['robbbPerPop'].max() - robberies['robbbPerPop'].min())
robberies_regression.head()
#CLASSIFICATION DATASET
robberies_classification = murders[['state','pop_bins','householdsize_norm', 'racepctblack_bins', 'racePctWhite_bins',
'racePctAsian_bins', 'racePctHisp_bins', 'agePct12t21_norm', 'agePct65up_norm', 'pctUrban_bins',
'medIncome_bins', 'pctWFarmSelf_bins', 'pctWInvInc_norm', 'pctWPubAsst_bins', 'pctWRetire_norm',
'blackPerCap_bins', 'indianPerCap_bins', 'AsianPerCap_bins', 'OtherPerCap_bins', 'HispPerCap_bins',
'PctPopUnderPov_bins', 'PctLess9thGrade_bins', 'PctBSorMore_bins','PctEmplManu_norm','PctEmplProfServ_norm',
'PctOccupManu_norm','MalePctDivorce_norm','MalePctNevMarr_norm','PctFam2Par_bins','PctWorkMomYoungKids_norm',
'PctImmigRecent_bins','PctRecentImmig_bins','PctPersOwnOccup_norm','PctHousLess3BR_norm','MedNumBR_bins',
'PctHousOccup_bins','PctVacantBoarded_bins','PctVacMore6Mos_norm','MedYrHousBuilt_bins','PctWOFullPlumb_bins',
'RentQrange_bins','MedRentPctHousInc_norm','MedOwnCostPctInc_norm','MedOwnCostPctIncNoMtg_bins',
'PctBornSameState_bins','PctSameHouse85_norm','PctSameState85_bins','LandArea_bins','PopDens_bins',
'PctUsePubTrans_bins']].copy()
robberies_classification.info()
robberies_classification['state']=robberies_classification['state'].astype('category')
#Check the minimum and maximum value
print(robberies['robbbPerPop'].min())
print(robberies['robbbPerPop'].max())
#Determine bins based on quantiles
robbbPerPop_ = pd.qcut(robberies['robbbPerPop'], q=3)
#Check the value counts of each bin to ensure they are balanced
robbbPerPop_.value_counts()
#Create bin labels
robbbPerPop_labels = ['Unlikely', 'Likely', 'Very Likely']
#Create bins
robbbPerPop_bin = [-1, 40, 145, 2500]
#Add new category
robberies_classification['robbbPerPop_class_target'] = pd.cut(robberies['robbbPerPop'], bins=robbbPerPop_bin, labels=robbbPerPop_labels)
robberies_classification.head()
0.0 2264.13
| state | pop_bins | householdsize_norm | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_norm | agePct65up_norm | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_norm | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_norm | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | LandArea_bins | PopDens_bins | PctUsePubTrans_bins | robbbPerPop_class_target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NJ | 10000-13500 | 0.407609 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 0.158370 | 0.189200 | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 0.372916 | 0.096037 | 0.090802 | 0.222845 | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% | 0.801139 | 85-90% | 0-7 | 1200-2000 | 0.5-50% | Unlikely |
| 1 | PA | 19000-29000 | 0.331522 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 0.129065 | 0.303659 | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 0.381438 | 0.117016 | 0.123821 | 0.247899 | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% | 0.890754 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | Unlikely |
| 2 | OR | 29000-51500 | 0.225543 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 0.136090 | 0.168656 | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 0.237681 | 0.172960 | 0.469929 | 0.213508 | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% | 0.371197 | 32-85% | 7-14 | 2000-3300 | 0.5-50% | Very Likely |
| 4 | MO | 515000-7500000 | 0.230978 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 0.271176 | 0.226961 | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 0.336050 | 0.311189 | 0.546580 | 0.330843 | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% | 50-65% | 0.456466 | 85-90% | 26-4000 | 1200-2000 | 0.5-50% | Likely |
| 5 | MA | 19000-29000 | 0.271739 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 0.132276 | 0.249658 | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 0.342349 | 0.166200 | 0.226415 | 0.373327 | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% | 75-95% | 0.773565 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | Unlikely |
robberies_classification['robbbPerPop_class_target'].value_counts()
Likely 656 Very Likely 642 Unlikely 621 Name: robbbPerPop_class_target, dtype: int64
#MAKING COPIES OF ALL DATASETS FOR BACKUP
murd_reg_copy = murders_regression.copy()
rob_reg_copy = robberies_regression.copy()
murders_class_copy = murders_classification.copy()
rob_class_copy = robberies_classification.copy()
murders_reg_backup = murders_regression
robberies_reg_backup = robberies_regression
murdregcopy = murders_regression.copy()
MURDERS DATASET:
#Make a backup copy of our dataset
murdclasscopy = murders_class_copy
#Encode the cateogorical variables with LABEL ENCODING METHOD
#This code block is a test run, encoding the column 'state'
#If successful, the rest of the categorical variables will be encoded the same way
from sklearn.preprocessing import LabelEncoder
#create instance of label encoder
lab = LabelEncoder()
#perform label encoding on 'team' column
murdclasscopy['state'] = lab.fit_transform(murdclasscopy['state'])
#Encode the rest of the categorical variables with for loop function
for cols in murdclasscopy.columns:
if murdclasscopy[cols].dtype == 'category':
murdclasscopy[cols] = lab.fit_transform(murdclasscopy[cols])
else:
pass
murdclasscopy.head()
| state | pop_bins | householdsize_norm | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_norm | agePct65up_norm | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_norm | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_norm | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | LandArea_bins | PopDens_bins | PctUsePubTrans_bins | murdPerPop_class_target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 0 | 0.407609 | 1 | 2 | 3 | 1 | 0.158370 | 0.189200 | 2 | 3 | 3 | 0.764559 | 0 | 0.355054 | 1 | 0 | 2 | 0 | 2 | 0 | 2 | 3 | 0.262609 | 0.372916 | 0.096037 | 0.090802 | 0.222845 | 3 | 0.316916 | 3 | 2 | 0.937939 | 0.086693 | 1 | 3 | 2 | 0.435135 | 1 | 1 | 3 | 0.440594 | 0.376344 | 2 | 1 | 0.801139 | 1 | 0 | 1 | 1 | 0 |
| 1 | 32 | 2 | 0.331522 | 0 | 2 | 3 | 0 | 0.129065 | 0.303659 | 2 | 3 | 3 | 0.688453 | 0 | 0.461118 | 2 | 0 | 2 | 0 | 1 | 0 | 2 | 2 | 0.212797 | 0.381438 | 0.117016 | 0.123821 | 0.247899 | 3 | 0.420456 | 0 | 1 | 0.908541 | 0.222583 | 1 | 3 | 0 | 0.192507 | 1 | 0 | 2 | 0.628713 | 0.354839 | 1 | 3 | 0.890754 | 3 | 3 | 2 | 1 | 0 |
| 2 | 31 | 3 | 0.225543 | 0 | 2 | 3 | 1 | 0.136090 | 0.168656 | 2 | 1 | 3 | 0.583729 | 0 | 0.263971 | 2 | 2 | 1 | 2 | 3 | 0 | 0 | 3 | 0.289704 | 0.237681 | 0.172960 | 0.469929 | 0.213508 | 2 | 0.655547 | 1 | 2 | 0.607912 | 0.481144 | 1 | 2 | 1 | 0.055942 | 3 | 0 | 1 | 0.455446 | 0.408602 | 0 | 0 | 0.371197 | 0 | 3 | 2 | 1 | 1 |
| 4 | 20 | 4 | 0.230978 | 1 | 2 | 1 | 0 | 0.271176 | 0.226961 | 2 | 2 | 2 | 0.401525 | 2 | 0.252794 | 3 | 1 | 3 | 2 | 3 | 2 | 3 | 2 | 0.255523 | 0.336050 | 0.311189 | 0.546580 | 0.330843 | 1 | 0.606452 | 2 | 1 | 0.530849 | 0.543238 | 0 | 1 | 2 | 0.292368 | 2 | 2 | 0 | 0.569307 | 0.172043 | 0 | 1 | 0.456466 | 1 | 2 | 1 | 1 | 1 |
| 5 | 16 | 2 | 0.271739 | 1 | 3 | 2 | 1 | 0.132276 | 0.249658 | 2 | 3 | 0 | 0.483379 | 2 | 0.327467 | 2 | 2 | 1 | 1 | 2 | 0 | 0 | 2 | 0.249479 | 0.342349 | 0.166200 | 0.226415 | 0.373327 | 2 | 0.641070 | 3 | 2 | 0.613235 | 0.479952 | 1 | 2 | 1 | 0.396532 | 0 | 1 | 3 | 0.470297 | 0.360215 | 1 | 3 | 0.773565 | 3 | 3 | 2 | 1 | 0 |
#check info of our dataframe to ensure it is of the correct type, and that all of the categorical attributes were transformed
murdclasscopy.info()
Now we are ready to run our models. First, I will split the data into train and test sets. I will be running a LOGISTIC REGRESSION model on both the murders and robberies datasets, both preprocessed specifically for this task. Then I will running my 3 classification models: KNN, Decision Tree, and Random forest. Finally, I will be running a confusion matrix for each model. I will only compare the 3 classification models to each other, using 3 evaluation metrics: Accuracy, Precision, and Recall. The logistic regression model is for exploration purposes only, for the sake of curiosity.
Next, I will use K-Folds cross validation for my train/test split, and re-run each model for both the MURDERS and ROBBERIES dataset. I will examine the confusion matrices and run evaluation metrics again.
MURDERS DATASET:
#Split the data into train test split
from sklearn.model_selection import train_test_split
#Split the dataset into training set and test set
#Our class column is murdPerPop_class_target, everything else will be used as features
class_murd_colname = 'murdPerPop_class_target'
feature_murd_names = murdclasscopy.columns[murdclasscopy.columns != class_murd_colname]
#70% training and 30% test
x1_train, x1_test, y1_train, y1_test = train_test_split(murdclasscopy.loc[:, feature_murd_names],
murdclasscopy[class_murd_colname], test_size=0.3, random_state=42)
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(class_weight= 'balanced')
logmodel.fit(x1_train, y1_train)
predictions_log = logmodel.predict(x1_test)
predictions_log
#KNN ALGORITHM
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(x1_train, y1_train)
murd_pred = classifier.predict(x1_test)
murd_pred
#DECISION TREE
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
#create and train the model
clf = DecisionTreeClassifier(max_depth=5, random_state=1234)
model = clf.fit(x1_train, y1_train)
#Predict the response for test dataset
murd_treepred = clf.predict(x1_test)
murd_treepred
from sklearn.tree import plot_tree
#plot tree (visual representation)
features = feature_murd_names
classes = class_murd_colname
plt.figure(figsize=(40, 40))
plot_tree(clf,
fontsize=10,
feature_names=features,
class_names=classes,
rounded=True,
filled=True,
proportion=True);
#plot tree (textual representation)
text_representation = tree.export_text(clf)
print(text_representation)
#RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier
randf=RandomForestClassifier()
randf.fit(x1_train, y1_train)
murd_rf_pred = randf.predict(x1_test)
murd_rf_pred
#CONFUSION MATRICES
from sklearn.metrics import confusion_matrix
log_cm = confusion_matrix(y1_test, predictions_log)
knn_cm = confusion_matrix(y1_test, murd_pred)
dt_cm = confusion_matrix(y1_test, murd_treepred)
rf_cm = confusion_matrix(y1_test, murd_rf_pred)
print('LOGISTIC REGRESSION CONFUSION MATRIX')
print(log_cm)
print('KNN CONFUSION MATRIX')
print(knn_cm)
print('DECISION TREE CONFUSION MATRIX')
print(dt_cm)
print('RANDOM FOREST CONFUSION MATRIX')
print(rf_cm)
LOGISTIC REGRESSION CONFUSION MATRIX [[206 53] [ 73 244]] KNN CONFUSION MATRIX [[183 76] [ 72 245]] DECISION TREE CONFUSION MATRIX [[197 62] [ 74 243]] RANDOM FOREST CONFUSION MATRIX [[195 64] [ 74 243]]
#EVALUATION METRICS
log_accuracy = (log_cm[0][0] + log_cm[1][1])/(len(y1_test))
log_precision = log_cm[0][0]/(log_cm[1][0] + log_cm[0][0])
log_recall = (log_cm[0][0]/(log_cm[0][0] + log_cm[0][1]))
knn_accuracy = (knn_cm[0][0] + knn_cm[1][1])/(len(y1_test))
knn_precision = knn_cm[0][0]/(knn_cm[1][0] + knn_cm[0][0])
knn_recall = (knn_cm[0][0]/(knn_cm[0][0] + knn_cm[0][1]))
dt_accuracy = (dt_cm[0][0] + dt_cm[1][1])/(len(y1_test))
dt_precision = dt_cm[0][0]/(dt_cm[1][0] + dt_cm[0][0])
dt_recall = (dt_cm[0][0]/(dt_cm[0][0] + dt_cm[0][1]))
rf_accuracy = (rf_cm[0][0] + rf_cm[1][1])/(len(y1_test))
rf_precision = rf_cm[0][0]/(rf_cm[1][0] + rf_cm[0][0])
rf_recall = (rf_cm[0][0]/(rf_cm[0][0] + rf_cm[0][1]))
print('ACCURACY')
print('Logistic Regression: ACCURACY=', log_accuracy)
print('KNN Classifier: ACCURACY=', knn_accuracy)
print('Decision Tree: ACCURACY=', dt_accuracy)
print('Random Forest: ACCURACY=', rf_accuracy)
print('PRECISION')
print('Logistic Regression: PRECISION=', log_precision)
print('KNN Classifier: PRECISION=', knn_precision)
print('Decision Tree: PRECISION=', dt_precision)
print('Random Forest: PRECISION=', rf_accuracy)
print('RECALL')
print('Logistic Regression: RECALL=', log_recall)
print('KNN Classifier: RECALL=', knn_recall)
print('Decision Tree: RECALL=', dt_recall)
print('Random Forest: RECALL=', rf_accuracy)
ACCURACY Logistic Regression: ACCURACY= 0.78125 KNN Classifier: ACCURACY= 0.7430555555555556 Decision Tree: ACCURACY= 0.7638888888888888 Random Forest: ACCURACY= 0.7604166666666666 PRECISION Logistic Regression: PRECISION= 0.7383512544802867 KNN Classifier: PRECISION= 0.7176470588235294 Decision Tree: PRECISION= 0.7269372693726938 Random Forest: PRECISION= 0.7604166666666666 RECALL Logistic Regression: RECALL= 0.7953667953667953 KNN Classifier: RECALL= 0.7065637065637066 Decision Tree: RECALL= 0.7606177606177607 Random Forest: RECALL= 0.7604166666666666
MURDERS: K-FOLD CROSS VALIDATION
# define dataset
X, y = murdclasscopy.loc[:, feature_murd_names], murdclasscopy[class_murd_colname]
# summarize the dataset
print(X.shape, y.shape)
(1919, 50) (1919,)
from numpy import mean
from numpy import std
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# prepare the cross-validation procedure
cv = KFold(n_splits=10, random_state=42, shuffle=True)
# evaluate a logistic regression model using k-fold cross-validation
# create model
log_model = LogisticRegression()
# evaluate model
log_scores_acc = cross_val_score(log_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
log_scores_pre = cross_val_score(log_model, X, y, scoring='precision', cv=cv, n_jobs=-1)
log_scores_re = cross_val_score(log_model, X, y, scoring='recall', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(log_scores_acc), std(log_scores_acc)))
print('Precision: %.3f (%.3f)' % (mean(log_scores_pre), std(log_scores_pre)))
print('Recall: %.3f (%.3f)' % (mean(log_scores_re), std(log_scores_re)))
Accuracy: 0.776 (0.029) Precision: 0.785 (0.033) Recall: 0.806 (0.040)
# evaluate a knn model using k-fold cross-validation
# create model
knn_model = KNeighborsClassifier()
# evaluate model
knn_scores_acc = cross_val_score(knn_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
knn_scores_pre = cross_val_score(knn_model, X, y, scoring='precision', cv=cv, n_jobs=-1)
knn_scores_re = cross_val_score(knn_model, X, y, scoring='recall', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(knn_scores_acc), std(knn_scores_acc)))
print('Precision: %.3f (%.3f)' % (mean(knn_scores_pre), std(knn_scores_pre)))
print('Recall: %.3f (%.3f)' % (mean(knn_scores_re), std(knn_scores_re)))
Accuracy: 0.752 (0.028) Precision: 0.759 (0.030) Recall: 0.795 (0.036)
# evaluate a decision tree model using k-fold cross-validation
# create model
dt_model = DecisionTreeClassifier(max_depth=5)
# evaluate model
dt_scores_acc = cross_val_score(dt_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
dt_scores_pre = cross_val_score(dt_model, X, y, scoring='precision', cv=cv, n_jobs=-1)
dt_scores_re = cross_val_score(dt_model, X, y, scoring='recall', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(dt_scores_acc), std(dt_scores_acc)))
print('Precision: %.3f (%.3f)' % (mean(dt_scores_pre), std(dt_scores_pre)))
print('Recall: %.3f (%.3f)' % (mean(dt_scores_re), std(dt_scores_re)))
Accuracy: 0.727 (0.029) Precision: 0.767 (0.042) Recall: 0.713 (0.040)
# evaluate a random forest model using k-fold cross-validation
# create model
rf_model = RandomForestClassifier()
# evaluate model
rf_scores_acc = cross_val_score(rf_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
rf_scores_pre = cross_val_score(rf_model, X, y, scoring='precision', cv=cv, n_jobs=-1)
rf_scores_re = cross_val_score(rf_model, X, y, scoring='recall', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(rf_scores_acc), std(rf_scores_acc)))
print('Precision: %.3f (%.3f)' % (mean(rf_scores_pre), std(rf_scores_pre)))
print('Recall: %.3f (%.3f)' % (mean(rf_scores_re), std(rf_scores_re)))
Accuracy: 0.768 (0.028) Precision: 0.794 (0.030) Recall: 0.786 (0.032)
ROBBERIES DATASET:
#Create a copy of the dataset
robclasscopy = rob_class_copy.copy()
robclasscopy.head()
| state | pop_bins | householdsize_norm | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_norm | agePct65up_norm | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_norm | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_norm | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | LandArea_bins | PopDens_bins | PctUsePubTrans_bins | robbbPerPop_class_target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NJ | 10000-13500 | 0.407609 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 0.158370 | 0.189200 | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 0.372916 | 0.096037 | 0.090802 | 0.222845 | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% | 0.801139 | 85-90% | 0-7 | 1200-2000 | 0.5-50% | Unlikely |
| 1 | PA | 19000-29000 | 0.331522 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 0.129065 | 0.303659 | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 0.381438 | 0.117016 | 0.123821 | 0.247899 | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% | 0.890754 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | Unlikely |
| 2 | OR | 29000-51500 | 0.225543 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 0.136090 | 0.168656 | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 0.237681 | 0.172960 | 0.469929 | 0.213508 | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% | 0.371197 | 32-85% | 7-14 | 2000-3300 | 0.5-50% | Very Likely |
| 4 | MO | 515000-7500000 | 0.230978 | 0.9-2.8% | 90-96% | 0.6-1.2% | 0.1-0.9% | 0.271176 | 0.226961 | 100% | $8,000-$24,000 | 0.7-1.0% | 0.401525 | 5-8% | 0.252794 | 6500-10000 | 10000-15000 | 8500-12500 | 5500-8000 | 7500-1000 | 17-60% | 8-12% | 19-30 | 0.255523 | 0.336050 | 0.311189 | 0.546580 | 0.330843 | 67-70% | 0.606452 | 18-65% | 0.2-0.5% | 0.530849 | 0.543238 | 0-2.5 | 91-94% | 1.75-4% | 0.292368 | 1964-1971 | 0.3-0.5% | 0-140 | 0.569307 | 0.172043 | 10-12% | 50-65% | 0.456466 | 85-90% | 26-4000 | 1200-2000 | 0.5-50% | Likely |
| 5 | MA | 19000-29000 | 0.271739 | 0.9-2.8% | 96-100% | 1.2-2.6% | 0.9-2.2% | 0.132276 | 0.249658 | 100% | 42,000-$125,000 | 0-0.5% | 0.483379 | 5-8% | 0.327467 | 15000-250000 | 15000-500000 | 12500-17500 | 11500-137000 | 13500-55000 | 0-5% | 0-5% | 19-30 | 0.249479 | 0.342349 | 0.166200 | 0.226415 | 0.373327 | 75-82% | 0.641070 | 7-12% | 0.5-1.3% | 0.613235 | 0.479952 | 2.5-4 | 94-96% | 0.75-1.75% | 0.396532 | 1939-1956 | 0.16-0.3% | 230-805 | 0.470297 | 0.360215 | 12-13% | 75-95% | 0.773565 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | Unlikely |
#Encode the cateogorical variables with LABEL ENCODING METHOD
#This code block is a test run, encoding the column 'state'
#If successful, the rest of the categorical variables will be encoded the same way
#perform label encoding on 'team' column
robclasscopy['state'] = lab.fit_transform(robclasscopy['state'])
robclasscopy.head(3)
| state | pop_bins | householdsize_norm | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_norm | agePct65up_norm | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_norm | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_norm | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | LandArea_bins | PopDens_bins | PctUsePubTrans_bins | robbbPerPop_class_target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 10000-13500 | 0.407609 | 0.9-2.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 0.158370 | 0.189200 | 100% | 42,000-$125,000 | 1.0-7.0% | 0.764559 | 0-3% | 0.355054 | 10000-15000 | 0-6500 | 17500-106500 | 0-5500 | 13500-55000 | 0-5% | 5-8% | 30-80 | 0.262609 | 0.372916 | 0.096037 | 0.090802 | 0.222845 | 82-95% | 0.316916 | 7-12% | 0.5-1.3% | 0.937939 | 0.086693 | 2.5-4 | 96-99% | 1.75-4% | 0.435135 | 1956-1964 | 0.16-0.3% | 230-805 | 0.440594 | 0.376344 | 13-14% | 50-65% | 0.801139 | 85-90% | 0-7 | 1200-2000 | 0.5-50% | Unlikely |
| 1 | 32 | 19000-29000 | 0.331522 | 0-0.8% | 90-96% | 2.7-57.5% | 0.1-0.9% | 0.129065 | 0.303659 | 100% | 42,000-$125,000 | 1.0-7.0% | 0.688453 | 0-3% | 0.461118 | 15000-250000 | 0-6500 | 17500-106500 | 0-5500 | 1000-13500 | 0-5% | 5-8% | 19-30 | 0.212797 | 0.381438 | 0.117016 | 0.123821 | 0.247899 | 82-95% | 0.420456 | 0-7% | 0.2-0.5% | 0.908541 | 0.222583 | 2.5-4 | 96-99% | 0-0.75% | 0.192507 | 1956-1964 | 0-0.16% | 170-230 | 0.628713 | 0.354839 | 12-13% | 75-95% | 0.890754 | 93-100% | 7-14 | 2000-3300 | 0.5-50% | Unlikely |
| 2 | 31 | 29000-51500 | 0.225543 | 0-0.8% | 90-96% | 2.7-57.5% | 0.9-2.2% | 0.136090 | 0.168656 | 100% | $32,000-$42,000 | 1.0-7.0% | 0.583729 | 0-3% | 0.263971 | 15000-250000 | 15000-500000 | 12500-17500 | 5500-8000 | 7500-1000 | 0-5% | 0-5% | 30-80 | 0.289704 | 0.237681 | 0.172960 | 0.469929 | 0.213508 | 75-82% | 0.655547 | 12-18% | 0.5-1.3% | 0.607912 | 0.481144 | 2.5-4 | 94-96% | 0.75-1.75% | 0.055942 | 1971-1987 | 0-0.16% | 140-170 | 0.455446 | 0.408602 | 10-12% | 0-50% | 0.371197 | 32-85% | 7-14 | 2000-3300 | 0.5-50% | Very Likely |
#Encode the rest of the categorical variables with for loop function
for r_cols in robclasscopy.columns:
if robclasscopy[r_cols].dtype == 'category':
robclasscopy[r_cols] = lab.fit_transform(robclasscopy[r_cols])
else:
pass
robclasscopy.head()
| state | pop_bins | householdsize_norm | racepctblack_bins | racePctWhite_bins | racePctAsian_bins | racePctHisp_bins | agePct12t21_norm | agePct65up_norm | pctUrban_bins | medIncome_bins | pctWFarmSelf_bins | pctWInvInc_norm | pctWPubAsst_bins | pctWRetire_norm | blackPerCap_bins | indianPerCap_bins | AsianPerCap_bins | OtherPerCap_bins | HispPerCap_bins | PctPopUnderPov_bins | PctLess9thGrade_bins | PctBSorMore_bins | PctEmplManu_norm | PctEmplProfServ_norm | PctOccupManu_norm | MalePctDivorce_norm | MalePctNevMarr_norm | PctFam2Par_bins | PctWorkMomYoungKids_norm | PctImmigRecent_bins | PctRecentImmig_bins | PctPersOwnOccup_norm | PctHousLess3BR_norm | MedNumBR_bins | PctHousOccup_bins | PctVacantBoarded_bins | PctVacMore6Mos_norm | MedYrHousBuilt_bins | PctWOFullPlumb_bins | RentQrange_bins | MedRentPctHousInc_norm | MedOwnCostPctInc_norm | MedOwnCostPctIncNoMtg_bins | PctBornSameState_bins | PctSameHouse85_norm | PctSameState85_bins | LandArea_bins | PopDens_bins | PctUsePubTrans_bins | robbbPerPop_class_target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 0 | 0.407609 | 1 | 2 | 3 | 1 | 0.158370 | 0.189200 | 2 | 3 | 3 | 0.764559 | 0 | 0.355054 | 1 | 0 | 2 | 0 | 2 | 0 | 2 | 3 | 0.262609 | 0.372916 | 0.096037 | 0.090802 | 0.222845 | 3 | 0.316916 | 3 | 2 | 0.937939 | 0.086693 | 1 | 3 | 2 | 0.435135 | 1 | 1 | 3 | 0.440594 | 0.376344 | 2 | 1 | 0.801139 | 1 | 0 | 1 | 1 | 1 |
| 1 | 32 | 2 | 0.331522 | 0 | 2 | 3 | 0 | 0.129065 | 0.303659 | 2 | 3 | 3 | 0.688453 | 0 | 0.461118 | 2 | 0 | 2 | 0 | 1 | 0 | 2 | 2 | 0.212797 | 0.381438 | 0.117016 | 0.123821 | 0.247899 | 3 | 0.420456 | 0 | 1 | 0.908541 | 0.222583 | 1 | 3 | 0 | 0.192507 | 1 | 0 | 2 | 0.628713 | 0.354839 | 1 | 3 | 0.890754 | 3 | 3 | 2 | 1 | 1 |
| 2 | 31 | 3 | 0.225543 | 0 | 2 | 3 | 1 | 0.136090 | 0.168656 | 2 | 1 | 3 | 0.583729 | 0 | 0.263971 | 2 | 2 | 1 | 2 | 3 | 0 | 0 | 3 | 0.289704 | 0.237681 | 0.172960 | 0.469929 | 0.213508 | 2 | 0.655547 | 1 | 2 | 0.607912 | 0.481144 | 1 | 2 | 1 | 0.055942 | 3 | 0 | 1 | 0.455446 | 0.408602 | 0 | 0 | 0.371197 | 0 | 3 | 2 | 1 | 2 |
| 4 | 20 | 4 | 0.230978 | 1 | 2 | 1 | 0 | 0.271176 | 0.226961 | 2 | 2 | 2 | 0.401525 | 2 | 0.252794 | 3 | 1 | 3 | 2 | 3 | 2 | 3 | 2 | 0.255523 | 0.336050 | 0.311189 | 0.546580 | 0.330843 | 1 | 0.606452 | 2 | 1 | 0.530849 | 0.543238 | 0 | 1 | 2 | 0.292368 | 2 | 2 | 0 | 0.569307 | 0.172043 | 0 | 1 | 0.456466 | 1 | 2 | 1 | 1 | 0 |
| 5 | 16 | 2 | 0.271739 | 1 | 3 | 2 | 1 | 0.132276 | 0.249658 | 2 | 3 | 0 | 0.483379 | 2 | 0.327467 | 2 | 2 | 1 | 1 | 2 | 0 | 0 | 2 | 0.249479 | 0.342349 | 0.166200 | 0.226415 | 0.373327 | 2 | 0.641070 | 3 | 2 | 0.613235 | 0.479952 | 1 | 2 | 1 | 0.396532 | 0 | 1 | 3 | 0.470297 | 0.360215 | 1 | 3 | 0.773565 | 3 | 3 | 2 | 1 | 1 |
#check info of our dataframe to ensure it is of the correct type, and that all of the categorical attributes were transformed
robclasscopy.info()
#Split the data into train test split
from sklearn.model_selection import train_test_split
#Split the dataset into training set and test set
#Our class column is murdPerPop_class_target, everything else will be used as features
class_rob_colname = 'robbbPerPop_class_target'
feature_rob_names = robclasscopy.columns[robclasscopy.columns != class_rob_colname]
#70% training and 30% test
x2_train, x2_test, y2_train, y2_test = train_test_split(robclasscopy.loc[:, feature_rob_names],
robclasscopy[class_rob_colname], test_size=0.3, random_state=42)
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
rob_logmodel = LogisticRegression(class_weight= 'balanced')
logmodel.fit(x2_train, y2_train)
rob_pred_log = logmodel.predict(x2_test)
rob_pred_log
#KNN ALGORITHM
from sklearn.neighbors import KNeighborsClassifier
rob_classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(x2_train, y2_train)
rob_pred = classifier.predict(x2_test)
rob_pred
#DECISION TREE
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
#create and train the model
r_clf = DecisionTreeClassifier(max_depth=5, random_state=1234)
r_model = r_clf.fit(x2_train, y2_train)
#Predict the response for test dataset
rob_treepred = r_clf.predict(x2_test)
rob_treepred
from sklearn.tree import plot_tree
#plot tree (visual representation)
r_features = feature_rob_names
r_classes = class_rob_colname
plt.figure(figsize=(40, 40))
plot_tree(r_clf,
fontsize=10,
feature_names=r_features,
class_names=r_classes,
rounded=True,
filled=True,
proportion=True);
#plot tree (textual representation)
text_representation = tree.export_text(r_clf)
print(text_representation)
#RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier
r_randf=RandomForestClassifier()
r_randf.fit(x2_train, y2_train)
rob_rf_pred = r_randf.predict(x1_test)
rob_rf_pred
#CONFUSION MATRICES
rlog_cm = confusion_matrix(y2_test, rob_pred_log)
rknn_cm = confusion_matrix(y2_test, rob_pred)
rdt_cm = confusion_matrix(y2_test, rob_treepred)
rrf_cm = confusion_matrix(y2_test, rob_rf_pred)
print('LOGISTIC REGRESSION CONFUSION MATRIX')
print(rlog_cm)
print('KNN CONFUSION MATRIX')
print(rknn_cm)
print('DECISION TREE CONFUSION MATRIX')
print(rdt_cm)
print('RANDOM FOREST CONFUSION MATRIX')
print(rrf_cm)
LOGISTIC REGRESSION CONFUSION MATRIX [[112 52 34] [ 43 141 3] [ 31 4 156]] KNN CONFUSION MATRIX [[100 56 42] [ 55 125 7] [ 37 6 148]] DECISION TREE CONFUSION MATRIX [[120 43 35] [ 64 119 4] [ 32 5 154]] RANDOM FOREST CONFUSION MATRIX [[127 41 30] [ 39 147 1] [ 27 2 162]]
#EVALUATION METRICS
from sklearn import metrics
rlog_accuracy = (metrics.accuracy_score(y2_test, rob_pred_log))
rlog_precision = (metrics.precision_score(y2_test, rob_pred_log, average='macro'))
rlog_recall = (metrics.recall_score(y2_test, rob_pred_log, average='macro'))
rknn_accuracy = (metrics.accuracy_score(y2_test, rob_pred))
rknn_precision = (metrics.precision_score(y2_test, rob_pred, average='macro'))
rknn_recall = (metrics.recall_score(y2_test, rob_pred, average='macro'))
rdt_accuracy = (metrics.accuracy_score(y2_test, rob_treepred))
rdt_precision = (metrics.precision_score(y2_test, rob_treepred, average='macro'))
rdt_recall = (metrics.recall_score(y2_test, rob_treepred, average='macro'))
rrf_accuracy = (metrics.accuracy_score(y2_test, rob_rf_pred))
rrf_precision = (metrics.precision_score(y2_test, rob_rf_pred, average='macro'))
rrf_recall = (metrics.recall_score(y2_test, rob_rf_pred, average='macro'))
print('ACCURACY')
print('Logistic Regression: ACCURACY=', rlog_accuracy)
print('KNN Classifier: ACCURACY=', rknn_accuracy)
print('Decision Tree: ACCURACY=', rdt_accuracy)
print('Random Forest: ACCURACY=', rrf_accuracy)
print('PRECISION')
print('Logistic Regression: PRECISION=', rlog_precision)
print('KNN Classifier: PRECISION=', rknn_precision)
print('Decision Tree: PRECISION=', rdt_precision)
print('Random Forest: PRECISION=', rrf_accuracy)
print('RECALL')
print('Logistic Regression: RECALL=', rlog_recall)
print('KNN Classifier: RECALL=', rknn_recall)
print('Decision Tree: RECALL=', rdt_recall)
print('Random Forest: RECALL=', rrf_accuracy)
ACCURACY Logistic Regression: ACCURACY= 0.7100694444444444 KNN Classifier: ACCURACY= 0.6475694444444444 Decision Tree: ACCURACY= 0.6822916666666666 Random Forest: ACCURACY= 0.7569444444444444 PRECISION Logistic Regression: PRECISION= 0.7087255778946533 KNN Classifier: PRECISION= 0.6468505222424303 Decision Tree: PRECISION= 0.688685955664951 Random Forest: PRECISION= 0.7569444444444444 RECALL Logistic Regression: RECALL= 0.7121403958484341 KNN Classifier: RECALL= 0.6494562709530373 Decision Tree: RECALL= 0.6829023216457771 Random Forest: RECALL= 0.7569444444444444
ROBBERIES: K-FOLD CROSS VALIDATION
# define dataset
X1, y1 = robclasscopy.loc[:, feature_rob_names], robclasscopy[class_rob_colname]
# summarize the dataset
print(X1.shape, y1.shape)
(1919, 50) (1919,)
# evaluate a logistic regression model using k-fold cross-validation
# prepare the cross-validation procedure
cv = KFold(n_splits=10, random_state=42, shuffle=True)
# create model
log_model = LogisticRegression()
# evaluate model
rlog_scores_acc = cross_val_score(log_model, X1, y1, scoring='accuracy', cv=cv, n_jobs=-1)
rlog_scores_pre = cross_val_score(log_model, X1, y1, scoring='precision', cv=cv, n_jobs=-1)
rlog_scores_re = cross_val_score(log_model, X1, y1, scoring='recall', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(rlog_scores_acc), std(rlog_scores_acc)))
print('Precision:',(metrics.precision_score(y2_test, rob_pred_log, average = 'macro')))
print('Recall:',(metrics.recall_score(y2_test, rob_pred_log, average = 'macro')))
Accuracy: 0.705 (0.042) Precision: 0.7087255778946533 Recall: 0.7121403958484341
# evaluate a knn model using k-fold cross-validation
# create model
knn_model = KNeighborsClassifier()
# evaluate model
rknn_scores_acc = cross_val_score(knn_model, X1, y1, scoring='accuracy', cv=cv, n_jobs=-1)
rknn_scores_pre = cross_val_score(knn_model, X1, y1, scoring='precision', cv=cv, n_jobs=-1)
rknn_scores_re = cross_val_score(knn_model, X1, y1, scoring='recall', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(rknn_scores_acc), std(rknn_scores_acc)))
print('Precision:',(metrics.precision_score(y2_test, rob_pred, average = 'macro')))
print('Recall:',(metrics.recall_score(y2_test, rob_pred, average = 'macro')))
Accuracy: 0.646 (0.031) Precision: 0.6468505222424303 Recall: 0.6494562709530373
# evaluate a decision tree model using k-fold cross-validation
# create model
dt_model = DecisionTreeClassifier(max_depth=5)
# evaluate model
rdt_scores_acc = cross_val_score(dt_model, X1, y1, scoring='accuracy', cv=cv, n_jobs=-1)
rdt_scores_pre = cross_val_score(dt_model, X1, y1, scoring='precision', cv=cv, n_jobs=-1)
rdt_scores_re = cross_val_score(dt_model, X1, y1, scoring='recall', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(rdt_scores_acc), std(rdt_scores_acc)))
print('Precision:',(metrics.precision_score(y2_test, rob_treepred, average = 'macro')))
print('Recall:', (metrics.recall_score(y2_test, rob_treepred, average = 'macro')))
Accuracy: 0.663 (0.027) Precision: 0.688685955664951 Recall: 0.6829023216457771
# evaluate a random forest model using k-fold cross-validation
# create model
rf_model = RandomForestClassifier()
# evaluate model
rrf_scores_acc = cross_val_score(rf_model, X1, y1, scoring='accuracy', cv=cv, n_jobs=-1)
rrf_scores_pre = cross_val_score(rf_model, X1, y1, scoring='precision', cv=cv, n_jobs=-1)
rrf_scores_re = cross_val_score(rf_model, X1, y1, scoring='recall', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(rrf_scores_acc),std(rdt_scores_acc)))
print('Precision:',(metrics.precision_score(y2_test, rob_rf_pred, average = 'macro')))
print('Recall:', (metrics.recall_score(y2_test, rob_rf_pred, average = 'macro')))
Accuracy: 0.714 (0.026) Precision: 0.7570311789837287 Recall: 0.7585593124552164